diff --git a/.gitignore b/.gitignore index 39e393db..bcbce7e8 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,6 @@ rever/ # Generated requirements.txt and uv lock file requirements.txt uv.lock + +# OS-specific files +.DS_store \ No newline at end of file diff --git a/docs/api/competition.dataset.md b/docs/api/competition.dataset.md deleted file mode 100644 index 8018af79..00000000 --- a/docs/api/competition.dataset.md +++ /dev/null @@ -1,5 +0,0 @@ -::: polaris.dataset.CompetitionDataset - options: - filters: ["!^_"] - ---- \ No newline at end of file diff --git a/docs/api/competition.evaluation.md b/docs/api/competition.evaluation.md index 2a66956f..79ace6f8 100644 --- a/docs/api/competition.evaluation.md +++ b/docs/api/competition.evaluation.md @@ -1,7 +1 @@ ::: polaris.evaluate.CompetitionPredictions - ---- - -::: polaris.evaluate.CompetitionResults - ---- \ No newline at end of file diff --git a/docs/api/dataset.md b/docs/api/dataset.md index 2b3cb7c4..225d6390 100644 --- a/docs/api/dataset.md +++ b/docs/api/dataset.md @@ -4,6 +4,12 @@ --- +::: polaris.dataset.DatasetV2 + options: + filters: ["!^_"] + +--- + ::: polaris.dataset._base.BaseDataset options: filters: ["!^_"] @@ -20,4 +26,4 @@ options: filters: ["!^_"] ---- \ No newline at end of file +--- diff --git a/docs/tutorials/competition.participate.ipynb b/docs/tutorials/competition.participate.ipynb index 1301e1e9..f31f1bd4 100644 --- a/docs/tutorials/competition.participate.ipynb +++ b/docs/tutorials/competition.participate.ipynb @@ -45,18 +45,10 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "9b465ea4-7c71-443b-9908-3f9e567ee4c4", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-08-09 18:05:23.205\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mlogin\u001b[0m:\u001b[36m267\u001b[0m - \u001b[32m\u001b[1mYou are successfully logged in to the Polaris Hub.\u001b[0m\n" - ] - } - ], + "outputs": [], "source": [ "import polaris as po\n", "from polaris.hub.client import PolarisHubClient\n", @@ -98,7 +90,7 @@ "The Polaris library is designed to make it easy to participate in a competition. In just a few lines of code, we can get the train and test partition, access the associated data in various ways and evaluate our predictions. There's two main API endpoints. \n", "\n", "- `get_train_test_split()`: For creating objects through which we can access the different dataset partitions.\n", - "- `evaluate()`: For evaluating a set of predictions in accordance with the competition protocol." + "- `submit_predictions()`: For submitting the predictions to an active competition." ] }, { @@ -116,7 +108,7 @@ "id": "e78bf878", "metadata": {}, "source": [ - "The created test and train objects support various flavours to access the data." + "Similar to benchmarks, the created test and train objects support various flavours to access the data." ] }, { @@ -134,7 +126,8 @@ "for i in range(len(train)):\n", " x, y = train[i]\n", "\n", - "# The objects have properties to access all data at once\n", + "# The objects have properties to access all data at once. Use this with\n", + "# caution if the underlying dataset is large!\n", "x = train.inputs\n", "y = train.targets" ] @@ -144,7 +137,7 @@ "id": "5ec12825", "metadata": {}, "source": [ - "Now, let's create some predictions against the official Polaris `hello-world-competition`. 
We will train a simple random forest model on the ECFP representation through scikit-learn and datamol, and then we will submit our results for secure evaluation by the Polaris Hub." + "Now, let's create some predictions against the imaginary `hello-world-competition`. Let's assume we train a simple random forest model on the ECFP representation through scikit-learn and datamol, and then we submit our results for secure evaluation by the Polaris Hub." ] }, { @@ -160,7 +153,7 @@ "# Load the competition (automatically loads the underlying dataset as well)\n", "competition = po.load_competition(\"polaris/hello-world-benchmark\")\n", "\n", - "# Get the split and convert SMILES to ECFP fingerprints by specifying an featurize function.\n", + "# Get the split and convert SMILES to ECFP fingerprints by specifying a featurize function.\n", "train, test = competition.get_train_test_split(featurization_fn=dm.to_fp)\n", "\n", "# Define a model and train\n", @@ -204,7 +197,7 @@ "id": "5ff06a9c", "metadata": {}, "source": [ - "Once your `CompetitionPredictions` object is created, you're ready to submit them for evaluation! This will automatically save your result to the Polaris Hub, but it will be private. You can choose to make it public through the Polaris web application. " + "Once your `CompetitionPredictions` object is created, you're ready to submit them for evaluation! This will automatically save your result to the Polaris Hub, but it will be private until the competition closes." ] }, { @@ -224,7 +217,7 @@ "id": "44973556", "metadata": {}, "source": [ - "That's it! Just like that you have partaken in your first Polaris competition. Keep an eye on that leaderboard and best of luck in your future competitions!\n", + "That's it! Just like that you have partaken in your first Polaris competition. Keep an eye on that leaderboard when it goes public and best of luck in your future competitions!\n", "\n", "The End.\n", "\n", diff --git a/env.yml b/env.yml index b766802f..7988ccbf 100644 --- a/env.yml +++ b/env.yml @@ -35,7 +35,7 @@ dependencies: - datamol >=0.12.1 # Storage - - zarr + - zarr >=2,<3 - pyarrow # Optional diff --git a/mkdocs.yml b/mkdocs.yml index 79a3a3ca..7a74a5fe 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -24,21 +24,16 @@ nav: - PDB Datasets: tutorials/dataset_pdb.ipynb - SDF Datasets: tutorials/dataset_sdf.ipynb - Optimization: tutorials/optimization.ipynb - # NOTE (cwognum): Competitions are currently gated. - # - Competitions: - # - tutorials/competition.participate.ipynb + - Competitions: + - tutorials/competition.participate.ipynb - API Reference: - Load: api/load.md - Core: - Dataset: api/dataset.md - Benchmark: api/benchmark.md + - Competition: api/competition.md - Subset: api/subset.md - Evaluation: api/evaluation.md - # NOTE (cwognum): Competitions are currently gated. 
- # - Competitions: - # - Competition Dataset: api/competition.dataset.md - # - Competition: api/competition.md - # - Competiton Evaluation: api/competition.evaluation.md - Hub: - Client: api/hub.client.md - External Auth Client: api/hub.external_client.md diff --git a/polaris/__init__.py b/polaris/__init__.py index ddb0f44a..a3854a17 100644 --- a/polaris/__init__.py +++ b/polaris/__init__.py @@ -4,9 +4,9 @@ from loguru import logger from ._version import __version__ -from .loader import load_benchmark, load_dataset +from .loader import load_benchmark, load_dataset, load_competition -__all__ = ["load_dataset", "load_benchmark", "__version__"] +__all__ = ["load_dataset", "load_benchmark", "load_competition", "__version__"] # Configure the default logging level os.environ["LOGURU_LEVEL"] = os.environ.get("LOGURU_LEVEL", "INFO") diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index a4a84e1a..6be6ba83 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -3,54 +3,86 @@ from hashlib import md5 from itertools import chain from pathlib import Path -from typing import Any, Callable, ClassVar, Collection, Literal, Sequence, TypeAlias +from typing import Any, Callable, ClassVar, Literal import fsspec import numpy as np -from loguru import logger from pydantic import ( + BaseModel, Field, computed_field, - field_serializer, field_validator, model_validator, ) -from pydantic_core.core_schema import ValidationInfo from sklearn.utils.multiclass import type_of_target from typing_extensions import Self from polaris._artifact import BaseArtifactModel +from polaris.benchmark._split import SplitSpecificationV1Mixin +from polaris.benchmark._task import PredictiveTaskSpecificationMixin from polaris.dataset import DatasetV1, Subset from polaris.dataset._base import BaseDataset -from polaris.evaluate import BenchmarkResults, Metric +from polaris.evaluate import BenchmarkResults from polaris.evaluate.utils import evaluate_benchmark from polaris.hub.settings import PolarisHubSettings from polaris.mixins import ChecksumMixin from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidBenchmarkError -from polaris.utils.misc import listit from polaris.utils.types import ( AccessType, HubOwner, IncomingPredictionsType, - SplitType, TargetType, - TaskType, ) -ColumnName: TypeAlias = str +class BaseSplitSpecificationMixin(BaseModel): + """Base mixin class to add a split field to a benchmark.""" -class BenchmarkSpecification(BaseArtifactModel, abc.ABC): + split: Any + + @property + @abc.abstractmethod + def test_set_sizes(self) -> dict[str, int]: + """The sizes of the test sets.""" + raise NotImplementedError + + @property + @abc.abstractmethod + def n_test_sets(self) -> int: + """The number of test sets""" + raise NotImplementedError + + @property + @abc.abstractmethod + def n_train_datapoints(self) -> int: + """The size of the train set.""" + raise NotImplementedError + + @property + @abc.abstractmethod + def test_set_labels(self) -> list[str]: + """The labels of the test sets.""" + raise NotImplementedError + + @property + @abc.abstractmethod + def n_test_datapoints(self) -> dict[str, int]: + """The size of (each of) the test set(s).""" + raise NotImplementedError + + +class BenchmarkSpecification( + PredictiveTaskSpecificationMixin, BaseArtifactModel, BaseSplitSpecificationMixin, abc.ABC +): """This class wraps a [`Dataset`][polaris.dataset.Dataset] with additional data to specify the evaluation logic. Specifically, it specifies: 1. 
Which dataset to use (see [`Dataset`][polaris.dataset.Dataset]); - 2. Which columns are used as input and which columns are used as target; - 3. Which metrics should be used to evaluate performance on this task; - 4. A predefined, static train-test split to use during evaluation. + 2. A task definition (we currently only support predictive tasks); + 3. A predefined, static train-test split to use during evaluation. info: Subclasses Polaris includes various subclasses of the `BenchmarkSpecification` that provide a more precise data-model or @@ -85,209 +117,28 @@ class BenchmarkSpecification(BaseArtifactModel, abc.ABC): Attributes: dataset: The dataset the benchmark specification is based on. - target_cols: The column(s) of the original dataset that should be used as target. - input_cols: The column(s) of the original dataset that should be used as input. - metrics: The metrics to use for evaluating performance - main_metric: The main metric used to rank methods. If `None`, the first of the `metrics` field. readme: Markdown text that can be used to provide a formatted description of the benchmark. If using the Polaris Hub, it is worth noting that this field is more easily edited through the Hub UI as it provides a rich text editor for writing markdown. - target_types: A dictionary that maps target columns to their type. If not specified, this is automatically inferred. - For additional meta-data attributes, see the [`BaseArtifactModel`][polaris._artifact.BaseArtifactModel] class. + For additional meta-data attributes, see the base classes. """ _artifact_type = "benchmark" - # Public attributes dataset: BaseDataset = Field(exclude=True) - target_cols: set[ColumnName] = Field(min_length=1) - input_cols: set[ColumnName] = Field(min_length=1) - metrics: set[Metric] = Field(min_length=1) - main_metric: Metric | str readme: str = "" - target_types: dict[ColumnName, TargetType] = Field(default_factory=dict, validate_default=True) - - @field_validator("target_cols", "input_cols", mode="before") - @classmethod - def _parse_cols(cls, v: str | Sequence[str], info: ValidationInfo) -> set[str]: - """ - Normalize columns input values to a set. - """ - if isinstance(v, str): - v = {v} - else: - v = set(v) - return v - - @field_validator("target_types", mode="before") - @classmethod - def _parse_target_types( - cls, v: dict[ColumnName, TargetType | str | None] - ) -> dict[ColumnName, TargetType]: - """ - Converts the target types to TargetType enums if they are strings. - """ - return { - target: TargetType(val) if isinstance(val, str) else val - for target, val in v.items() - if val is not None - } - - @field_validator("metrics", mode="before") - @classmethod - def _validate_metrics(cls, v: str | Metric | Collection[str | Metric]) -> set[Metric]: - """ - Verifies all specified metrics are either a Metric object or a valid metric name. - Also verifies there are no duplicate metrics. - - If there are multiple test sets, it is assumed the same metrics are used across test sets. 
- """ - if isinstance(v, str): - v = {"label": v} - if not isinstance(v, Collection): - v = [v] - - def _convert(m: str | dict | Metric) -> Metric: - if isinstance(m, str): - return Metric(label=m) - if isinstance(m, dict): - return Metric(**m) - return m - - v = [_convert(m) for m in v] - - unique_metrics = set(v) - - if len(unique_metrics) != len(v): - raise InvalidBenchmarkError("The benchmark specifies duplicate metrics.") - - unique_names = {m.name for m in unique_metrics} - if len(unique_names) != len(unique_metrics): - raise InvalidBenchmarkError( - "The metrics of a benchmark need to have unique names. Specify a custom name with Metric(custom_name=...)" - ) - - return unique_metrics - - @model_validator(mode="after") - def _validate_main_metric_is_in_metrics(self) -> Self: - if isinstance(self.main_metric, str): - for m in self.metrics: - if m.name == self.main_metric: - self.main_metric = m - break - if self.main_metric not in self.metrics: - raise InvalidBenchmarkError("The main metric should be one of the specified metrics") - return self - - @model_validator(mode="after") - def _validate_cols(self) -> Self: - """ - Verifies that all specified columns are present in the dataset. - """ - columns = self.target_cols | self.input_cols - dataset_columns = set(self.dataset.columns) - if not columns.issubset(dataset_columns): - raise InvalidBenchmarkError("Not all specified columns were found in the dataset.") - - return self - - @field_serializer("metrics") - def _serialize_metrics(self, value: set[Metric]) -> list[Metric]: - """ - Convert the set to a list. Since metrics are models and will be converted to dict, - they will not be hashable members of a set. - """ - return list(value) - - @model_validator(mode="after") - def _validate_target_types(self) -> Self: - """ - Verifies that all target types are for benchmark targets. - """ - columns = set(self.target_types.keys()) - if not columns.issubset(self.target_cols): - raise InvalidBenchmarkError( - f"Not all specified target types were found in the target columns. {columns} - {self.target_cols}" - ) - return self - - @field_serializer("main_metric") - def _serialize_main_metric(value: Metric) -> str: - """ - Convert the Metric to it's name - """ - return value.name - - @field_serializer("target_types") - def _serialize_target_types(self, target_types): - """ - Convert from enum to string to make sure it's serializable - """ - return {k: v.value for k, v in target_types.items()} - - @field_serializer("target_cols", "input_cols") - def _serialize_columns(self, v: set[str]) -> list[str]: - return list(v) @computed_field @property def dataset_artifact_id(self) -> str: return self.dataset.artifact_id - @computed_field - @property - def task_type(self) -> str: - """The high-level task type of the benchmark.""" - v = TaskType.MULTI_TASK if len(self.target_cols) > 1 else TaskType.SINGLE_TASK - return v.value - @abc.abstractmethod def _get_test_sets( self, hide_targets=True, featurization_fn: Callable | None = None ) -> dict[str, Subset]: raise NotImplementedError - @property - @abc.abstractmethod - def n_test_datapoints(self) -> dict[str, int]: - """ - The size of (each of) the test set(s). - """ - raise NotImplementedError - - @property - @abc.abstractmethod - def test_set_labels(self) -> list[str]: - """ - The labels of the test sets. - """ - raise NotImplementedError - - @property - @abc.abstractmethod - def n_train_datapoints(self) -> int: - """ - The size of the train set. 
- """ - raise NotImplementedError - - @property - @abc.abstractmethod - def n_test_sets(self) -> int: - """ - The number of test sets - """ - raise NotImplementedError - - @property - @abc.abstractmethod - def test_set_sizes(self) -> dict[str, int]: - """ - The sizes of the test sets. - """ - raise NotImplementedError - def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subset: """Returns a [`Subset`][polaris.dataset.Subset] using the given indices. Used internally to construct the train and test sets.""" @@ -422,11 +273,10 @@ def __str__(self): return self.__repr__() -class BenchmarkV1Specification(BenchmarkSpecification, ChecksumMixin): +class BenchmarkV1Specification(SplitSpecificationV1Mixin, ChecksumMixin, BenchmarkSpecification): _version: ClassVar[Literal[1]] = 1 dataset: DatasetV1 = Field(exclude=True) - split: SplitType @field_validator("dataset", mode="before") @classmethod @@ -472,74 +322,33 @@ def _infer_target_types(self) -> Self: return self @model_validator(mode="after") - def _validate_split(self) -> Self: + def _validate_split_in_dataset(self) -> Self: + # All indices are valid given the dataset. We check the len of `self` here because a + # competition entity includes both the dataset and benchmark in one artifact. + max_i = len(self.dataset) + if any(i < 0 or i >= max_i for i in chain(self.split[0], *self.split[1].values())): + raise InvalidBenchmarkError("The predefined split contains invalid indices") + + return self + + @model_validator(mode="after") + def _validate_cols_in_dataset(self) -> Self: """ - Verifies that: - 1) There are no empty test partitions - 2) All indices are valid given the dataset - 3) There is no duplicate indices in any of the sets - 4) There is no overlap between the train and test set - 5) No row exists in the test set where all labels are missing/empty + Verifies that all specified columns are present in the dataset. """ - - if not isinstance(self.split[1], dict): - self.split = self.split[0], {"test": self.split[1]} - split = self.split - - # Train partition can be empty (zero-shot) - # Test partitions cannot be empty - if any(len(v) == 0 for v in split[1].values()): - raise InvalidBenchmarkError("The predefined split contains empty test partitions") - - train_idx_list = split[0] - full_test_idx_list = list(chain.from_iterable(split[1].values())) - - if len(train_idx_list) == 0: - logger.info( - "This benchmark only specifies a test set. It will return an empty train set in `get_train_test_split()`" - ) - - train_idx_set = set(train_idx_list) - full_test_idx_set = set(full_test_idx_list) - - # The train and test indices do not overlap - if len(train_idx_set & full_test_idx_set) > 0: - raise InvalidBenchmarkError("The predefined split specifies overlapping train and test sets") - - # Check for duplicate indices within the train set - if len(train_idx_set) != len(train_idx_list): - raise InvalidBenchmarkError("The training set contains duplicate indices") - - # Check for duplicate indices within a given test set. Because a user can specify - # multiple test sets for a given benchmark and it is acceptable for indices to be shared - # across test sets, we check for duplicates in each test set independently. 
- for test_set_name, test_set_idx_list in split[1].items(): - if len(test_set_idx_list) != len(set(test_set_idx_list)): - raise InvalidBenchmarkError( - f'Test set with name "{test_set_name}" contains duplicate indices' - ) - - # All indices are valid given the dataset - dataset = self.dataset - if dataset is not None: - max_i = len(dataset) - if any(i < 0 or i >= max_i for i in chain(train_idx_list, full_test_idx_set)): - raise InvalidBenchmarkError("The predefined split contains invalid indices") + columns = self.target_cols | self.input_cols + dataset_columns = set(self.dataset.columns) + if not columns.issubset(dataset_columns): + raise InvalidBenchmarkError("Not all target or input columns were found in the dataset.") return self - @field_serializer("split") - def _serialize_split(self, v: SplitType): - """Convert any tuple to list to make sure it's serializable""" - return listit(v) - def _compute_checksum(self) -> str: """ Computes a hash of the benchmark. This is meant to uniquely identify the benchmark and can be used to verify the version. """ - hash_fn = md5() hash_fn.update(self.dataset.md5sum.encode("utf-8")) for c in sorted(self.target_cols): @@ -565,7 +374,8 @@ def _compute_checksum(self) -> str: def _get_test_sets( self, hide_targets=True, featurization_fn: Callable | None = None ) -> dict[str, Subset]: - """Construct the test set(s), given the split in the benchmark specification. Used + """ + Construct the test set(s), given the split in the benchmark specification. Used internally to construct the test set for client use and evaluation. """ test_split = self.split[1] @@ -601,39 +411,6 @@ def get_train_test_split( test = test["test"] return train, test - @computed_field - @property - def test_set_sizes(self) -> dict[str, int]: - """The sizes of the test sets.""" - return {k: len(v) for k, v in self.split[1].items()} - - @computed_field - @property - def n_test_sets(self) -> int: - """The number of test sets""" - return len(self.split[1]) - - @computed_field - @property - def n_train_datapoints(self) -> int: - """The size of the train set.""" - return len(self.split[0]) - - @computed_field - @property - def test_set_labels(self) -> list[str]: - """The labels of the test sets.""" - return sorted(list(self.split[1].keys())) - - @computed_field - @property - def n_test_datapoints(self) -> dict[str, int]: - """The size of (each of) the test set(s).""" - if self.n_test_sets == 1: - return {"test": len(self.split[1])} - else: - return {k: len(v) for k, v in self.split[1].items()} - @computed_field @property def n_classes(self) -> dict[str, int]: diff --git a/polaris/benchmark/_split.py b/polaris/benchmark/_split.py new file mode 100644 index 00000000..cdc40707 --- /dev/null +++ b/polaris/benchmark/_split.py @@ -0,0 +1,114 @@ +from itertools import chain + +from loguru import logger +from pydantic import BaseModel, computed_field, field_serializer, model_validator +from typing_extensions import Self + +from polaris.utils.errors import InvalidBenchmarkError +from polaris.utils.misc import listit +from polaris.utils.types import SplitType + + +class SplitSpecificationV1Mixin(BaseModel): + """ + Mixin class to add a split field to a benchmark. This is the V1 implementation. + + The split is defined as a (train, test) tuple, where train is a list of indices and + test is a dictionary that maps test set names to lists of indices. + + Warning: Scalability + The simple list-based representation we use for the split in this first implementation doesn't scale well. 
+ We therefore worked on a V2 implementation that uses roaring bitmaps. + See [`SplitSpecificationV2Mixin`][`polaris.experimental._split_v2.SplitSpecificationV2Mixin`] for more details. + + Attributes: + split: The predefined train-test split to use for evaluation. + """ + + split: SplitType + + @model_validator(mode="after") + def _validate_split(self) -> Self: + """ + Verifies that: + 1) There are no empty test partitions + 2) There is no overlap between the train and test set + 3) There is no duplicate indices in any of the sets + """ + + if not isinstance(self.split[1], dict): + self.split = self.split[0], {"test": self.split[1]} + split = self.split + + # Train partition can be empty (zero-shot) + # Test partitions cannot be empty + if any(len(v) == 0 for v in split[1].values()): + raise InvalidBenchmarkError("The predefined split contains empty test partitions") + + train_idx_list = split[0] + full_test_idx_list = list(chain.from_iterable(split[1].values())) + + if len(train_idx_list) == 0: + logger.info( + "This benchmark only specifies a test set. It will return an empty train set in `get_train_test_split()`" + ) + + train_idx_set = set(train_idx_list) + full_test_idx_set = set(full_test_idx_list) + + # The train and test indices do not overlap + if len(train_idx_set & full_test_idx_set) > 0: + raise InvalidBenchmarkError("The predefined split specifies overlapping train and test sets") + + # Check for duplicate indices within the train set + if len(train_idx_set) != len(train_idx_list): + raise InvalidBenchmarkError("The training set contains duplicate indices") + + # Check for duplicate indices within a given test set. Because a user can specify + # multiple test sets for a given benchmark and it is acceptable for indices to be shared + # across test sets, we check for duplicates in each test set independently. 
+ for test_set_name, test_set_idx_list in split[1].items(): + if len(test_set_idx_list) != len(set(test_set_idx_list)): + raise InvalidBenchmarkError( + f'Test set with name "{test_set_name}" contains duplicate indices' + ) + + return self + + @field_serializer("split") + def _serialize_split(self, v: SplitType): + """Convert any tuple to list to make sure it's serializable""" + return listit(v) + + @computed_field + @property + def test_set_sizes(self) -> dict[str, int]: + """The sizes of the test sets.""" + return {k: len(v) for k, v in self.split[1].items()} + + @computed_field + @property + def n_test_sets(self) -> int: + """The number of test sets""" + return len(self.split[1]) + + @computed_field + @property + def n_train_datapoints(self) -> int: + """The size of the train set.""" + return len(self.split[0]) + + @computed_field + @property + def test_set_labels(self) -> list[str]: + """The labels of the test sets.""" + return sorted(list(self.split[1].keys())) + + @computed_field + @property + def n_test_datapoints(self) -> dict[str, int]: + """The size of (each of) the test set(s).""" + if self.n_test_sets == 1: + return {"test": len(self.split[1]["test"])} + else: + return {k: len(v) for k, v in self.split[1].items()} diff --git a/polaris/benchmark/_task.py b/polaris/benchmark/_task.py new file mode 100644 index 00000000..e18d4595 --- /dev/null +++ b/polaris/benchmark/_task.py @@ -0,0 +1,152 @@ +from typing import Collection, Sequence + +from pydantic import ( + BaseModel, + Field, + ValidationInfo, + computed_field, + field_serializer, + field_validator, + model_validator, +) +from typing_extensions import Self + +from polaris.evaluate import Metric +from polaris.utils.errors import InvalidBenchmarkError +from polaris.utils.types import ColumnName, TargetType, TaskType + + +class PredictiveTaskSpecificationMixin(BaseModel): + """A mixin for predictive task benchmarks. + + Attributes: + target_cols: The column(s) of the original dataset that should be used as the target. + input_cols: The column(s) of the original dataset that should be used as input. + metrics: The metrics to use for evaluating performance. + main_metric: The main metric used to rank methods. If `None`, this defaults to the first of the `metrics` field. + target_types: A dictionary that maps target columns to their type. If not specified, this is automatically inferred. + """ + + target_cols: set[ColumnName] = Field(min_length=1) + input_cols: set[ColumnName] = Field(min_length=1) + metrics: set[Metric] = Field(min_length=1) + main_metric: Metric | str + target_types: dict[ColumnName, TargetType] = Field(default_factory=dict, validate_default=True) + + @field_validator("target_cols", "input_cols", mode="before") + @classmethod + def _parse_cols(cls, v: str | Sequence[str], info: ValidationInfo) -> set[str]: + """ + Normalize columns input values to a set. + """ + if isinstance(v, str): + v = {v} + else: + v = set(v) + return v + + @field_validator("target_types", mode="before") + @classmethod + def _parse_target_types( + cls, v: dict[ColumnName, TargetType | str | None] + ) -> dict[ColumnName, TargetType]: + """ + Converts the target types to TargetType enums if they are strings. 
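        For example (illustrative, not part of this patch): ``{"y": "classification", "z": None}``
        becomes ``{"y": TargetType("classification")}``; ``None`` entries are dropped so that their
        target types can be inferred automatically later.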
+ """ + return { + target: TargetType(val) if isinstance(val, str) else val + for target, val in v.items() + if val is not None + } + + @field_validator("metrics", mode="before") + @classmethod + def _validate_metrics(cls, v: str | Metric | Collection[str | Metric]) -> set[Metric]: + """ + Verifies all specified metrics are either a Metric object or a valid metric name. + Also verifies there are no duplicate metrics. + + If there are multiple test sets, it is assumed the same metrics are used across test sets. + """ + if isinstance(v, str): + v = {"label": v} + if not isinstance(v, Collection): + v = [v] + + def _convert(m: str | dict | Metric) -> Metric: + if isinstance(m, str): + return Metric(label=m) + if isinstance(m, dict): + return Metric(**m) + return m + + v = [_convert(m) for m in v] + + unique_metrics = set(v) + + if len(unique_metrics) != len(v): + raise InvalidBenchmarkError("The benchmark specifies duplicate metrics.") + + unique_names = {m.name for m in unique_metrics} + if len(unique_names) != len(unique_metrics): + raise InvalidBenchmarkError( + "The metrics of a benchmark need to have unique names. Specify a custom name with Metric(custom_name=...)" + ) + + return unique_metrics + + @model_validator(mode="after") + def _validate_main_metric_is_in_metrics(self) -> Self: + if isinstance(self.main_metric, str): + for m in self.metrics: + if m.name == self.main_metric: + self.main_metric = m + break + if self.main_metric not in self.metrics: + raise InvalidBenchmarkError("The main metric should be one of the specified metrics") + return self + + @field_serializer("metrics") + def _serialize_metrics(self, value: set[Metric]) -> list[Metric]: + """ + Convert the set to a list. Since metrics are models and will be converted to dict, + they will not be hashable members of a set. + """ + return list(value) + + @model_validator(mode="after") + def _validate_target_types(self) -> Self: + """ + Verifies that all target types are for benchmark targets. + """ + columns = set(self.target_types.keys()) + if not columns.issubset(self.target_cols): + raise InvalidBenchmarkError( + f"Not all specified target types were found in the target columns. 
{columns} - {self.target_cols}" + ) + return self + + @field_serializer("main_metric") + def _serialize_main_metric(value: Metric) -> str: + """ + Convert the Metric to it's name + """ + return value.name + + @field_serializer("target_types") + def _serialize_target_types(self, target_types): + """ + Convert from enum to string to make sure it's serializable + """ + return {k: v.value for k, v in target_types.items()} + + @field_serializer("target_cols", "input_cols") + def _serialize_columns(self, v: set[str]) -> list[str]: + return list(v) + + @computed_field + @property + def task_type(self) -> str: + """The high-level task type of the benchmark.""" + v = TaskType.MULTI_TASK if len(self.target_cols) > 1 else TaskType.SINGLE_TASK + return v.value diff --git a/polaris/competition/__init__.py b/polaris/competition/__init__.py index 376773e8..57e2bb9b 100644 --- a/polaris/competition/__init__.py +++ b/polaris/competition/__init__.py @@ -1,3 +1,226 @@ -from polaris.competition._competition import CompetitionSpecification +from collections import defaultdict +from datetime import datetime +from itertools import chain +from typing import Callable -__all__ = ["CompetitionSpecification"] +from pydantic import Field, computed_field, model_validator +from typing_extensions import Self + +from polaris.benchmark._split import SplitSpecificationV1Mixin +from polaris.benchmark._task import PredictiveTaskSpecificationMixin +from polaris.dataset import DatasetV2, Subset +from polaris.evaluate import CompetitionPredictions +from polaris.utils.dict2html import dict2html +from polaris.utils.errors import InvalidCompetitionError +from polaris.utils.types import ( + ColumnName, + HttpUrlString, + HubOwner, + HubUser, + IncomingPredictionsType, + SlugCompatibleStringType, +) + + +class CompetitionSpecification(DatasetV2, PredictiveTaskSpecificationMixin, SplitSpecificationV1Mixin): + """An instance of this class represents a Polaris competition. It defines fields and functionality + that in combination with the [`DatasetV2`][polaris.dataset.DatasetV2] class, allow + users to participate in competitions hosted on Polaris Hub. + + Examples: + Basic API usage: + ```python + import polaris as po + + # Load the benchmark from the Hub + competition = po.load_competition("dummy-user/dummy-name") + + # Get the train and test data-loaders + train, test = competition.get_train_test_split() + + # Use the training data to train your model + # Get the input as an array with 'train.inputs' and 'train.targets' + # Or simply iterate over the train object. + for x, y in train: + ... + + # Work your magic to accurately predict the test set + prediction_values = np.array([0.0 for x in test]) + + # Submit your predictions + competition.submit_predictions( + prediction_name="first-prediction", + prediction_owner="dummy-user", + report_url="REPORT_URL", + predictions=prediction_values, + ) + ``` + + Attributes: + start_time: The time at which the competition starts accepting prediction submissions. + end_time: The time at which the competition stops accepting prediction submissions. + n_classes: The number of classes within each target column that defines a classification task. + + For additional meta-data attributes, see the base classes. 
+ """ + + _artifact_type = "competition" + + dataset: None = None + + start_time: datetime + end_time: datetime + n_classes: dict[ColumnName, int | None] = Field(..., default_factory=lambda: defaultdict(None)) + + @model_validator(mode="after") + def _validate_split_in_dataset(self) -> Self: + """ + All indices are valid given the dataset. We check the len of `self` here because a + competition entity includes both the dataset and benchmark in one artifact. + """ + max_i = len(self) + if any(i < 0 or i >= max_i for i in chain(self.split[0], *self.split[1].values())): + raise InvalidCompetitionError("The predefined split contains invalid indices") + + return self + + @model_validator(mode="after") + def _validate_cols_in_dataset(self) -> Self: + """ + Verifies that all specified columns are present in the dataset. + """ + columns = self.target_cols | self.input_cols + dataset_columns = set(self.columns) + if not columns.issubset(dataset_columns): + raise InvalidCompetitionError("Not all target or input columns were found in the dataset.") + + return self + + @model_validator(mode="after") + def _validate_n_classes(self) -> Self: + """ + The number of classes for each of the target columns. + """ + columns = set(self.n_classes.keys()) + if not columns.issubset(self.target_cols): + raise InvalidCompetitionError("Not all target class members were found in the target columns.") + + return self + + @computed_field + @property + def dataset_artifact_id(self) -> str: + return self.artifact_id + + def _get_subset(self, indices, hide_targets=True, featurization_fn=None) -> Subset: + """Returns a [`Subset`][polaris.dataset.Subset] using the given indices. Used + internally to construct the train and test sets.""" + return Subset( + dataset=self, + indices=indices, + input_cols=self.input_cols, + target_cols=self.target_cols, + hide_targets=hide_targets, + featurization_fn=featurization_fn, + ) + + def _get_test_sets( + self, hide_targets=True, featurization_fn: Callable | None = None + ) -> dict[str, Subset]: + """ + Construct the test set(s), given the split in the competition specification. Used + internally to construct the test set for client use and evaluation. + """ + test_split = self.split[1] + return { + k: self._get_subset(v, hide_targets=hide_targets, featurization_fn=featurization_fn) + for k, v in test_split.items() + } + + def get_train_test_split( + self, featurization_fn: Callable | None = None + ) -> tuple[Subset, Subset | dict[str, Subset]]: + """Construct the train and test sets, given the split in the competition specification. + + Returns [`Subset`][polaris.dataset.Subset] objects, which offer several ways of accessing the data + and can thus easily serve as a basis to build framework-specific (e.g. PyTorch, Tensorflow) + data-loaders on top of. + + Args: + featurization_fn: A function to apply to the input data. If a multi-input benchmark, this function + expects an input in the format specified by the `input_format` parameter. + + Returns: + A tuple with the train `Subset` and test `Subset` objects. + If there are multiple test sets, these are returned in a dictionary and each test set has + an associated name. The targets of the test set can not be accessed. + """ + + train = self._get_subset(self.split[0], hide_targets=False, featurization_fn=featurization_fn) + test = self._get_test_sets(hide_targets=True, featurization_fn=featurization_fn) + + # For improved UX, we return the object instead of the dictionary if there is only one test set. 
+ # Internally, however, assume that the test set is always a dictionary simplifies the code. + if len(test) == 1: + test = test["test"] + return train, test + + def submit_predictions( + self, + predictions: IncomingPredictionsType, + prediction_name: SlugCompatibleStringType, + prediction_owner: str, + report_url: HttpUrlString, + contributors: list[HubUser] | None = None, + github_url: HttpUrlString | None = None, + description: str = "", + tags: list[str] | None = None, + user_attributes: dict[str, str] | None = None, + ) -> None: + """ + Convenient wrapper around the + [`PolarisHubClient.submit_competition_predictions`][polaris.hub.client.PolarisHubClient.submit_competition_predictions] method. + It handles the creation of a standardized predictions object, which is expected by the Hub, automatically. + + Args: + prediction_name: The name of the prediction. + prediction_owner: The slug of the user/organization which owns the prediction. + predictions: The predictions for each test set defined in the competition. + report_url: A URL to a report/paper/write-up which describes the methods used to generate the predictions. + contributors: The users credited with generating these predictions. + github_url: An optional URL to a code repository containing the code used to generated these predictions. + description: An optional and short description of the predictions. + tags: An optional list of tags to categorize the prediction by. + user_attributes: An optional dict with additional, textual user attributes. + """ + from polaris.hub.client import PolarisHubClient + + standardized_predictions = CompetitionPredictions( + name=prediction_name, + owner=HubOwner(slug=prediction_owner), + predictions=predictions, + report_url=report_url, + contributors=contributors or [], + github_url=github_url, + description=description, + tags=tags or [], + user_attributes=user_attributes or {}, + target_labels=self.target_cols, + test_set_labels=self.test_set_labels, + test_set_sizes=self.test_set_sizes, + ) + + with PolarisHubClient() as client: + client.submit_competition_predictions( + competition=self, competition_predictions=standardized_predictions + ) + + def _repr_html_(self): + """For pretty printing in Jupyter.""" + return dict2html(self.model_dump(exclude={"zarr_manifest_path", "zarr_manifest_md5sum", "split"})) + + def __repr__(self): + return self.model_dump_json(exclude={"zarr_manifest_path", "zarr_manifest_md5sum", "split"}, indent=2) + + def __str__(self): + return self.__repr__() diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py deleted file mode 100644 index bd28f617..00000000 --- a/polaris/competition/_competition.py +++ /dev/null @@ -1,45 +0,0 @@ -from datetime import datetime -from typing import Optional - -from polaris.benchmark._base import BenchmarkV1Specification -from polaris.evaluate import CompetitionPredictions -from polaris.hub.settings import PolarisHubSettings -from polaris.utils.types import HubOwner - - -class CompetitionSpecification(BenchmarkV1Specification): - """Much of the underlying data model and logic is shared across Benchmarks and Competitions, and - anything within this class serves as a point of differentiation between the two. - - Attributes: - owner: A slug-compatible name for the owner of the competition. This is redefined such - that it is required. - start_time: The time at which the competition becomes active and interactable. - end_time: The time at which the competition ends and is no longer interactable. 
- """ - - _artifact_type = "competition" - - # Additional properties specific to Competitions - owner: HubOwner - start_time: datetime | None = None - end_time: datetime | None = None - - def evaluate( - self, - predictions: CompetitionPredictions, - settings: Optional[PolarisHubSettings] = None, - cache_auth_token: bool = True, - **kwargs: dict, - ): - """Light convenience wrapper around - [`PolarisHubClient.evaluate_competition`][polaris.hub.client.PolarisHubClient.evaluate_competition]. - """ - from polaris.hub.client import PolarisHubClient - - with PolarisHubClient( - settings=settings, - cache_auth_token=cache_auth_token, - **kwargs, - ) as client: - return client.evaluate_competition(self, predictions) diff --git a/polaris/dataset/__init__.py b/polaris/dataset/__init__.py index 2cc4fd9b..8084749c 100644 --- a/polaris/dataset/__init__.py +++ b/polaris/dataset/__init__.py @@ -1,19 +1,19 @@ from polaris.dataset._column import ColumnAnnotation, KnownContentType, Modality -from polaris.dataset._competition_dataset import CompetitionDataset from polaris.dataset._dataset import DatasetV1, DatasetV1 as Dataset +from polaris.dataset._dataset_v2 import DatasetV2 from polaris.dataset._factory import DatasetFactory, create_dataset_from_file, create_dataset_from_files from polaris.dataset._subset import Subset + __all__ = [ + "create_dataset_from_file", + "create_dataset_from_files", "ColumnAnnotation", "Dataset", - "CompetitionDataset", - "Subset", - "Modality", - "KnownContentType", "DatasetFactory", - "create_dataset_from_file", - "create_dataset_from_files", "DatasetV1", - "Dataset", + "DatasetV2", + "KnownContentType", + "Modality", + "Subset", ] diff --git a/polaris/dataset/_base.py b/polaris/dataset/_base.py index fd0af212..1cc2ea1f 100644 --- a/polaris/dataset/_base.py +++ b/polaris/dataset/_base.py @@ -63,7 +63,7 @@ class BaseDataset(BaseArtifactModel, abc.ABC): license: The dataset license. Polaris only supports some Creative Commons licenses. See [`SupportedLicenseType`][polaris.utils.types.SupportedLicenseType] for accepted ID values. curation_reference: A reference to the curation process, e.g. a DOI, Github repo or URI. - For additional meta-data attributes, see the [`BaseArtifactModel`][polaris._artifact.BaseArtifactModel] class. + For additional meta-data attributes, see the base classes. Raises: InvalidDatasetError: If the dataset does not conform to the Pydantic data-model specification. diff --git a/polaris/dataset/_competition_dataset.py b/polaris/dataset/_competition_dataset.py deleted file mode 100644 index 8f8e74cb..00000000 --- a/polaris/dataset/_competition_dataset.py +++ /dev/null @@ -1,25 +0,0 @@ -from pydantic import model_validator -from typing_extensions import Self - -from polaris.dataset._dataset import DatasetV1 -from polaris.utils.errors import InvalidCompetitionError - - -class CompetitionDataset(DatasetV1): - """Dataset subclass for Polaris competitions. - - In addition to the data model and logic of the base Dataset class, - this class adds additional functionality which validates certain aspects - of the training data for a given competition. 
- """ - - _artifact_type = "competitionDataset" - - @model_validator(mode="after") - def _validate_model(self) -> Self: - """We reject the instantiation of competition datasets which leverage Zarr for the time being""" - - if self.uses_zarr: - raise InvalidCompetitionError("Pointer columns are not currently supported in competitions.") - - return self diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index c4b49b1d..9aaa5deb 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -43,7 +43,7 @@ class DatasetV1(BaseDataset, ChecksumMixin): table: The core data-structure, storing data-points in a row-wise manner. Can be specified as either a path to a `.parquet` file or a `pandas.DataFrame`. - For additional meta-data attributes, see the [`BaseDataset`][polaris.dataset._base.BaseDataset] class. + For additional meta-data attributes, see the base classes. Raises: InvalidDatasetError: If the dataset does not conform to the Pydantic data-model specification. diff --git a/polaris/experimental/_dataset_v2.py b/polaris/dataset/_dataset_v2.py similarity index 96% rename from polaris/experimental/_dataset_v2.py rename to polaris/dataset/_dataset_v2.py index 7b362fdc..5144f1f7 100644 --- a/polaris/experimental/_dataset_v2.py +++ b/polaris/dataset/_dataset_v2.py @@ -29,13 +29,10 @@ class DatasetV2(BaseDataset): this was a bottleneck when the number of data points grew large. Even with the pointer columns, you still need to load all pointers into memory. V2 therefore switches to a Zarr-only format. - Info: This feature is still experimental - The DatasetV2 is in active development and will likely undergo breaking changes before release. - Attributes: - zarr_root_path: The path to the Zarr archive. Different from V1, this is now required. + zarr_root_path: Required path to a Zarr archive. - For additional meta-data attributes, see the [`BaseDataset`][polaris._dataset.BaseDataset] class. + For additional meta-data attributes, see the base classes. Raises: InvalidDatasetError: If the dataset does not conform to the Pydantic data-model specification. diff --git a/polaris/evaluate/__init__.py b/polaris/evaluate/__init__.py index 3405ecac..67423f63 100644 --- a/polaris/evaluate/__init__.py +++ b/polaris/evaluate/__init__.py @@ -1,18 +1,17 @@ +from polaris.evaluate._metadata import ResultsMetadata from polaris.evaluate._metric import Metric, MetricInfo -from polaris.evaluate._predictions import BenchmarkPredictions +from polaris.evaluate._predictions import BenchmarkPredictions, CompetitionPredictions from polaris.evaluate._results import ( BenchmarkResults, - CompetitionPredictions, CompetitionResults, EvaluationResult, - ResultsMetadata, ) from polaris.evaluate.utils import evaluate_benchmark __all__ = [ + "ResultsMetadata", "Metric", "MetricInfo", - "ResultsMetadata", "EvaluationResult", "BenchmarkResults", "CompetitionResults", diff --git a/polaris/evaluate/_metadata.py b/polaris/evaluate/_metadata.py new file mode 100644 index 00000000..8dd35b21 --- /dev/null +++ b/polaris/evaluate/_metadata.py @@ -0,0 +1,34 @@ +from datetime import datetime + +from pydantic import Field, PrivateAttr + +from polaris._artifact import BaseArtifactModel +from polaris.utils.dict2html import dict2html +from polaris.utils.types import HttpUrlString, HubUser + + +class ResultsMetadata(BaseArtifactModel): + """Base class for evaluation results + + Attributes: + github_url: The URL to the code repository that was used to generate these results. 
+ paper_url: The URL to the paper describing the methodology used to generate these results. + contributors: The users that are credited for these results. + + For additional meta-data attributes, see the base classes. + """ + + # Additional meta-data + github_url: HttpUrlString | None = None + paper_url: HttpUrlString | None = None + contributors: list[HubUser] = Field(default_factory=list) + + # Private attributes + _created_at: datetime = PrivateAttr(default_factory=datetime.now) + + def _repr_html_(self) -> str: + """For pretty-printing in Jupyter Notebooks""" + return dict2html(self.model_dump()) + + def __repr__(self): + return self.model_dump_json(indent=2) diff --git a/polaris/evaluate/_predictions.py b/polaris/evaluate/_predictions.py index c56030cb..472f9984 100644 --- a/polaris/evaluate/_predictions.py +++ b/polaris/evaluate/_predictions.py @@ -4,6 +4,7 @@ from pydantic import ( BaseModel, ConfigDict, + Field, TypeAdapter, field_serializer, field_validator, @@ -11,8 +12,15 @@ ) from typing_extensions import Self +from polaris.evaluate import ResultsMetadata from polaris.utils.misc import convert_lists_to_arrays -from polaris.utils.types import IncomingPredictionsType, PredictionsType +from polaris.utils.types import ( + HttpUrlString, + HubOwner, + IncomingPredictionsType, + PredictionsType, + SlugCompatibleStringType, +) class BenchmarkPredictions(BaseModel): @@ -26,6 +34,7 @@ class BenchmarkPredictions(BaseModel): predictions: The predictions for the benchmark. target_labels: The target columns for the associated benchmark. test_set_labels: The names of the test sets for the associated benchmark. + test_set_sizes: The number of rows in each test set for the associated benchmark. """ predictions: PredictionsType @@ -82,12 +91,18 @@ def _validate_predictions(cls, data: dict) -> dict: predictions = convert_lists_to_arrays(predictions) predictions = cls._normalize_predictions(predictions, target_labels, test_set_labels) - return { - "predictions": predictions, - "target_labels": target_labels, - "test_set_labels": test_set_labels, - "test_set_sizes": test_set_sizes, - } + # Update class data with the normalized fields. Use of the `update()` method + # is required to prevent overwriting class data when this class is inherited. + data.update( + { + "predictions": predictions, + "target_labels": target_labels, + "test_set_labels": test_set_labels, + "test_set_sizes": test_set_sizes, + } + ) + + return data @model_validator(mode="after") def check_test_set_size(self) -> Self: @@ -127,6 +142,7 @@ def _normalize_predictions( raise ValueError( "The predictions for single-task, single test set benchmarks should be a numpy array." ) + predictions = {test_set_labels[0]: {target_labels[0]: predictions}} # (3) Single-task, multiple test sets: We expect a dictionary with the test set labels as keys. @@ -231,3 +247,31 @@ def flatten(self) -> np.ndarray: def __len__(self) -> int: """Return the total number of predictions""" return self.get_size() + + +class CompetitionPredictions(BenchmarkPredictions, ResultsMetadata): + """ + Predictions for competition benchmarks. + + This object is to be used as input to + [`PolarisHubClient.submit_competition_predictions`][polaris.hub.client.PolarisHubClient.submit_competition_predictions]. + It is used to ensure that the structure of the predictions are compatible with evaluation methods on the Polaris Hub. + In addition to the predictions, it contains meta-data that describes a predictions object. 
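# A minimal construction sketch (not part of this patch). The target and test set names are
# hypothetical; most users never build this object directly and instead call
# CompetitionSpecification.submit_predictions(), which wraps it.
import numpy as np

from polaris.evaluate import CompetitionPredictions
from polaris.utils.types import HubOwner

competition_predictions = CompetitionPredictions(
    name="first-prediction",
    owner=HubOwner(slug="dummy-user"),
    report_url="https://example.com/my-report",
    predictions=np.array([0.0, 1.0, 0.5]),  # single task, single test set
    target_labels=["my_target"],
    test_set_labels=["test"],
    test_set_sizes={"test": 3},
)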
+ + Attributes: + name: A slug-compatible name for the artifact. It is redeclared here to be required. + owner: A slug-compatible name for the owner of the artifact. It is redeclared here to be required. + report_url: A URL to a report/paper/write-up which describes the methods used to generate the predictions. + """ + + _artifact_type = "competition-prediction" + + name: SlugCompatibleStringType + owner: HubOwner + paper_url: HttpUrlString = Field(alias="report_url", serialization_alias="reportUrl") + + def __repr__(self): + return self.model_dump_json(by_alias=True, indent=2) + + def __str__(self): + return self.__repr__() diff --git a/polaris/evaluate/_results.py b/polaris/evaluate/_results.py index 98202f0b..92a4fc51 100644 --- a/polaris/evaluate/_results.py +++ b/polaris/evaluate/_results.py @@ -1,4 +1,3 @@ -from datetime import datetime from typing import ClassVar import pandas as pd @@ -6,7 +5,6 @@ BaseModel, ConfigDict, Field, - PrivateAttr, computed_field, field_serializer, field_validator, @@ -14,16 +12,12 @@ ) from pydantic.alias_generators import to_camel -from polaris._artifact import BaseArtifactModel -from polaris.evaluate import BenchmarkPredictions -from polaris.utils.dict2html import dict2html +from polaris.evaluate import ResultsMetadata, BenchmarkPredictions from polaris.utils.errors import InvalidResultError from polaris.utils.misc import slugify from polaris.utils.types import ( AccessType, - HttpUrlString, HubOwner, - HubUser, SlugCompatibleStringType, ) @@ -46,33 +40,6 @@ class ResultRecords(BaseModel): model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True) -class ResultsMetadata(BaseArtifactModel): - """Base class for evaluation results - - Attributes: - github_url: The URL to the GitHub repository of the code used to generate these results. - paper_url: The URL to the paper describing the methodology used to generate these results. - contributors: The users that are credited for these results. - _created_at: The time-stamp at which the results were created. Automatically set. - For additional meta-data attributes, see the [`BaseArtifactModel`][polaris._artifact.BaseArtifactModel] class. - """ - - # Additional meta-data - github_url: HttpUrlString | None = None - paper_url: HttpUrlString | None = None - contributors: list[HubUser] = Field(default_factory=list) - - # Private attributes - _created_at: datetime = PrivateAttr(default_factory=datetime.now) - - def _repr_html_(self) -> str: - """For pretty-printing in Jupyter Notebooks""" - return dict2html(self.model_dump()) - - def __repr__(self): - return self.model_dump_json(indent=2) - - class EvaluationResult(ResultsMetadata): """Class for saving evaluation results @@ -87,13 +54,13 @@ class EvaluationResult(ResultsMetadata): question: Categorizing methods An open question is how to best categorize a methodology (e.g. a model). - This is needed since we would like to be able to aggregate results across benchmarks/competitions too, + This is needed since we would like to be able to aggregate results across benchmarks too, to say something about which (type of) methods performs best _in general_. Attributes: results: Evaluation results are stored directly in a dataframe or in a serialized, JSON compatible dict that can be decoded into the associated tabular format. - For additional meta-data attributes, see the [`ResultsMetadata`][polaris.evaluate._results.ResultsMetadata] class. + For additional meta-data attributes, see the base classes. 
""" # Define the columns of the results table diff --git a/polaris/experimental/_benchmark_v2.py b/polaris/experimental/_benchmark_v2.py index fc081021..7ea916db 100644 --- a/polaris/experimental/_benchmark_v2.py +++ b/polaris/experimental/_benchmark_v2.py @@ -1,144 +1,19 @@ -from functools import cached_property -from hashlib import md5 -from typing import Any, Callable, ClassVar, Generator, Literal, Sequence +from typing import Any, Callable, ClassVar, Literal -from loguru import logger -from pydantic import BaseModel, ConfigDict, Field, computed_field, field_validator, model_validator -from pydantic.alias_generators import to_camel -from pyroaring import BitMap +from pydantic import Field, field_validator, model_validator from typing_extensions import Self from polaris.benchmark import BenchmarkSpecification -from polaris.benchmark._base import ColumnName -from polaris.dataset import Subset -from polaris.experimental._dataset_v2 import DatasetV2 +from polaris.dataset import DatasetV2, Subset +from polaris.experimental._split_v2 import SplitSpecificationV2Mixin from polaris.utils.errors import InvalidBenchmarkError +from polaris.utils.types import ColumnName -class IndexSet(BaseModel): - """ - A set of indices for a split, either training or test. - - It wraps a Roaring Bitmap object to store the indices, and provides - useful properties when serializing for upload to the Hub. - """ - - model_config = ConfigDict(arbitrary_types_allowed=True, alias_generator=to_camel) - - indices: BitMap = Field(default_factory=BitMap, frozen=True, exclude=True) - - @field_validator("indices", mode="before") - @classmethod - def _validate_indices(cls, v: BitMap | Sequence[int]) -> BitMap: - """ - Accepts an initial sequence of ints, and turn it into a BitMap - """ - if isinstance(v, BitMap): - return v - return BitMap(v) - - @computed_field - @cached_property - def datapoints(self) -> int: - return len(self.indices) - - @computed_field - @cached_property - def md5_checksum(self) -> str: - return md5(self.serialize()).hexdigest() - - def intersect(self, other: Self) -> bool: - return self.indices.intersect(other.indices) - - def serialize(self) -> bytes: - return self.indices.serialize() - - @staticmethod - def deserialize(index_set: bytes) -> "IndexSet": - return IndexSet(indices=BitMap.deserialize(index_set)) - - -class SplitV2(BaseModel): - training: IndexSet - test: IndexSet - - @field_validator("training", "test", mode="before") - @classmethod - def _parse_index_sets(cls, v: bytes | IndexSet) -> bytes | IndexSet: - """ - Accepted a binary serialized IndexSet - """ - if isinstance(v, bytes): - return IndexSet.deserialize(v) - return v - - @field_validator("training") - @classmethod - def _validate_training_set(cls, v: IndexSet) -> IndexSet: - """ - Training index set can be empty (zero-shot) - """ - if v.datapoints == 0: - logger.info( - "This benchmark only specifies a test set. 
It will return an empty train set in `get_train_test_split()`" - ) - return v - - @field_validator("test") - @classmethod - def _validate_test_set(cls, v: IndexSet) -> IndexSet: - """ - Test index set cannot be empty - """ - if v.datapoints == 0: - raise InvalidBenchmarkError("The predefined split contains empty test partitions") - return v - - @model_validator(mode="after") - def validate_set_overlap(self) -> Self: - """ - The training and test index sets do not overlap - """ - if self.training.intersect(self.test): - raise InvalidBenchmarkError("The predefined split specifies overlapping train and test sets") - return self - - @property - def n_train_datapoints(self) -> int: - """ - The size of the train set. - """ - return self.training.datapoints - - @property - def n_test_sets(self) -> int: - """ - The number of test sets - """ - # TODO: Until we support multi-test benchmarks - return 1 - - @property - def n_test_datapoints(self) -> dict[str, int]: - """ - The size of (each of) the test set(s). - """ - return {"test": self.test.datapoints} - - @property - def max_index(self) -> int: - return max(self.training.indices.max(), self.test.indices.max()) - - def test_items(self) -> Generator[tuple[str, IndexSet], None, None]: - # TODO: Until we support multi-test benchmarks - yield "test", self.test - - -class BenchmarkV2Specification(BenchmarkSpecification): +class BenchmarkV2Specification(SplitSpecificationV2Mixin, BenchmarkSpecification): _version: ClassVar[Literal[2]] = 2 dataset: DatasetV2 = Field(exclude=True) - split: SplitV2 n_classes: dict[ColumnName, int] @field_validator("dataset", mode="before") @@ -181,34 +56,6 @@ def _validate_split_in_dataset(self) -> Self: return self - @computed_field - @property - def n_train_datapoints(self) -> int: - """The size of the train set.""" - return self.split.n_train_datapoints - - @computed_field - @property - def n_test_sets(self) -> int: - """The number of test sets""" - return self.split.n_test_sets - - @computed_field - @property - def n_test_datapoints(self) -> dict[str, int]: - """The size of (each of) the test set(s).""" - return self.split.n_test_datapoints - - @computed_field - @property - def test_set_sizes(self) -> dict[str, int]: - return {label: index_set.datapoints for label, index_set in self.split.test_items()} - - @computed_field - @property - def test_set_labels(self) -> list[str]: - return list(label for label, _ in self.split.test_items()) - def _get_test_sets( self, hide_targets=True, featurization_fn: Callable | None = None ) -> dict[str, Subset]: diff --git a/polaris/experimental/_split_v2.py b/polaris/experimental/_split_v2.py new file mode 100644 index 00000000..ef0b11f0 --- /dev/null +++ b/polaris/experimental/_split_v2.py @@ -0,0 +1,174 @@ +from functools import cached_property +from hashlib import md5 +from typing import Generator, Sequence + +from loguru import logger +from pydantic import BaseModel, ConfigDict, Field, computed_field, field_validator, model_validator +from pydantic.alias_generators import to_camel +from pyroaring import BitMap +from typing_extensions import Self + +from polaris.utils.errors import InvalidBenchmarkError + + +class IndexSet(BaseModel): + """ + A set of indices for a split, either training or test. + + It wraps a Roaring Bitmap object to store the indices, and provides + useful properties when serializing for upload to the Hub. 
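# Illustrative sketch (not part of this patch): how the recomposed
# BenchmarkV2Specification shown above is expected to be put together now that
# the split logic lives in SplitSpecificationV2Mixin. The inherited field names
# (name, input_cols, target_cols, metrics, main_metric) are taken from the
# existing test fixtures; `dataset_v2` is a placeholder for an existing
# DatasetV2 instance and is not defined here.
from polaris.experimental._benchmark_v2 import BenchmarkV2Specification
from polaris.experimental._split_v2 import IndexSet, SplitV2

benchmark = BenchmarkV2Specification(
    name="my-benchmark-v2",
    dataset=dataset_v2,  # placeholder: a DatasetV2 instance
    split=SplitV2(
        training=IndexSet(indices=range(90)),
        test=IndexSet(indices=range(90, 100)),
    ),
    input_cols=["A"],
    target_cols=["B"],
    metrics=["mean_absolute_error"],
    main_metric="mean_absolute_error",
    n_classes={"B": 0},
)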
+ """ + + model_config = ConfigDict(arbitrary_types_allowed=True, alias_generator=to_camel) + + indices: BitMap = Field(default_factory=BitMap, frozen=True, exclude=True) + + @field_validator("indices", mode="before") + @classmethod + def _validate_indices(cls, v: BitMap | Sequence[int]) -> BitMap: + """ + Accepts an initial sequence of ints, and turn it into a BitMap + """ + if isinstance(v, BitMap): + return v + return BitMap(v) + + @computed_field + @cached_property + def datapoints(self) -> int: + return len(self.indices) + + @computed_field + @cached_property + def md5_checksum(self) -> str: + return md5(self.serialize()).hexdigest() + + def intersect(self, other: Self) -> bool: + return self.indices.intersect(other.indices) + + def serialize(self) -> bytes: + return self.indices.serialize() + + @staticmethod + def deserialize(index_set: bytes) -> "IndexSet": + return IndexSet(indices=BitMap.deserialize(index_set)) + + +class SplitV2(BaseModel): + training: IndexSet + test: IndexSet + + @field_validator("training", "test", mode="before") + @classmethod + def _parse_index_sets(cls, v: bytes | IndexSet) -> bytes | IndexSet: + """ + Accepted a binary serialized IndexSet + """ + if isinstance(v, bytes): + return IndexSet.deserialize(v) + return v + + @field_validator("training") + @classmethod + def _validate_training_set(cls, v: IndexSet) -> IndexSet: + """ + Training index set can be empty (zero-shot) + """ + if v.datapoints == 0: + logger.info( + "This benchmark only specifies a test set. It will return an empty train set in `get_train_test_split()`" + ) + return v + + @field_validator("test") + @classmethod + def _validate_test_set(cls, v: IndexSet) -> IndexSet: + """ + Test index set cannot be empty + """ + if v.datapoints == 0: + raise InvalidBenchmarkError("The predefined split contains empty test partitions") + return v + + @model_validator(mode="after") + def validate_set_overlap(self) -> Self: + """ + The training and test index sets do not overlap + """ + if self.training.intersect(self.test): + raise InvalidBenchmarkError("The predefined split specifies overlapping train and test sets") + return self + + @property + def n_train_datapoints(self) -> int: + """ + The size of the train set. + """ + return self.training.datapoints + + @property + def n_test_sets(self) -> int: + """ + The number of test sets + """ + # TODO: Until we support multi-test benchmarks + return 1 + + @property + def n_test_datapoints(self) -> dict[str, int]: + """ + The size of (each of) the test set(s). + """ + # TODO: Until we support multi-test benchmarks + return {"test": self.test.datapoints} + + @property + def max_index(self) -> int: + # TODO: Until we support multi-test benchmarks (need) + return max(self.training.indices.max(), self.test.indices.max()) + + def test_items(self) -> Generator[tuple[str, IndexSet], None, None]: + # TODO: Until we support multi-test benchmarks + yield "test", self.test + + +class SplitSpecificationV2Mixin(BaseModel): + """ + Mixin class to add a split field to a benchmark. This is the V2 implementation. + + The internal representation for the split is a roaring bitmap, + which drastically improves scalability over the V1 implementation. + + Attributes: + split: The predefined train-test split to use for evaluation. 
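# Illustrative sketch (not part of this patch) of the IndexSet/SplitV2 classes
# defined above: indices are stored in a roaring bitmap, can be built from any
# sequence of ints, and round-trip through the binary serialization used for
# Hub uploads.
from polaris.experimental._split_v2 import IndexSet, SplitV2

split = SplitV2(
    training=IndexSet(indices=range(90)),   # accepts a plain sequence of ints
    test=IndexSet(indices=range(90, 100)),  # must be non-empty and disjoint from training
)

assert split.n_train_datapoints == 90
assert split.n_test_datapoints == {"test": 10}
assert split.max_index == 99

# The binary form is what gets uploaded to the Hub; the checksum is computed over it.
blob = split.test.serialize()
assert IndexSet.deserialize(blob).indices == split.test.indices
print(split.test.md5_checksum)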
+ """ + + split: SplitV2 + + @computed_field + @property + def n_train_datapoints(self) -> int: + """The size of the train set.""" + return self.split.n_train_datapoints + + @computed_field + @property + def n_test_sets(self) -> int: + """The number of test sets""" + return self.split.n_test_sets + + @computed_field + @property + def n_test_datapoints(self) -> dict[str, int]: + """The size of (each of) the test set(s).""" + return self.split.n_test_datapoints + + @computed_field + @property + def test_set_sizes(self) -> dict[str, int]: + return {label: index_set.datapoints for label, index_set in self.split.test_items()} + + @computed_field + @property + def test_set_labels(self) -> list[str]: + return list(label for label, _ in self.split.test_items()) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 649fd7d9..0d06adba 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -22,10 +22,9 @@ SingleTaskBenchmarkSpecification, ) from polaris.competition import CompetitionSpecification -from polaris.dataset import CompetitionDataset, Dataset, DatasetV1 -from polaris.evaluate import BenchmarkResults, CompetitionPredictions, CompetitionResults +from polaris.dataset import Dataset, DatasetV1, DatasetV2 +from polaris.evaluate import BenchmarkResults, CompetitionPredictions from polaris.experimental._benchmark_v2 import BenchmarkV2Specification -from polaris.experimental._dataset_v2 import DatasetV2 from polaris.hub.external_client import ExternalAuthClient from polaris.hub.oauth import CachedTokenAuth from polaris.hub.settings import PolarisHubSettings @@ -40,7 +39,6 @@ ) from polaris.utils.types import ( AccessType, - ArtifactSubtype, ChecksumStrategy, HubOwner, SupportedLicenseType, @@ -347,7 +345,7 @@ def get_dataset( error_msg="Failed to fetch dataset.", ): try: - return self._get_v1_dataset(owner, name, ArtifactSubtype.STANDARD, verify_checksum) + return self._get_v1_dataset(owner, name, verify_checksum) except PolarisRetrieveArtifactError: # If the v1 dataset is not found, try to load a v2 dataset return self._get_v2_dataset(owner, name) @@ -356,25 +354,19 @@ def _get_v1_dataset( self, owner: str | HubOwner, name: str, - artifact_type: ArtifactSubtype, verify_checksum: ChecksumStrategy = "verify_unless_zarr", ) -> DatasetV1: - """Loads either a standard or competition dataset from Polaris Hub + """Loads a V1 dataset from Polaris Hub Args: owner: The owner of the dataset. Can be either a user or organization from the Polaris Hub. name: The name of the dataset. - artifact_type: indicates whether the artifact is of the standard or competition type. verify_checksum: Whether to use the checksum to verify the integrity of the dataset. Returns: A `Dataset` instance, if it exists. 
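# Illustrative sketch (not part of this patch): with the change above,
# PolarisHubClient.get_dataset() first tries the V1 endpoint and transparently
# falls back to V2, so callers no longer need to know which storage format a
# dataset uses. The owner/name values below are placeholders.
from polaris.hub.client import PolarisHubClient

with PolarisHubClient() as client:
    dataset = client.get_dataset("some-org", "some-dataset")
    # Either a DatasetV1 or a DatasetV2 instance, depending on what the Hub returns.
    print(type(dataset).__name__)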
""" - url = ( - f"/v1/dataset/{owner}/{name}" - if artifact_type == ArtifactSubtype.STANDARD - else f"/v2/competition/dataset/{owner}/{name}" - ) + url = f"/v1/dataset/{owner}/{name}" response = self._base_request_to_hub(url=url, method="GET") response_data = response.json() @@ -391,12 +383,8 @@ def _get_v1_dataset( # It should be None if the dataset does not use pointer columns zarr_root_path = str(zarr_root_path) - if artifact_type == ArtifactSubtype.COMPETITION: - dataset = CompetitionDataset(table=table, zarr_root_path=zarr_root_path, **response_data) - md5sum = response_data["maskedMd5Sum"] - else: - dataset = DatasetV1(table=table, zarr_root_path=zarr_root_path, **response_data) - md5sum = response_data["md5Sum"] + dataset = DatasetV1(table=table, zarr_root_path=zarr_root_path, **response_data) + md5sum = response_data["md5Sum"] if dataset.should_verify_checksum(verify_checksum): dataset.verify_checksum(md5sum) @@ -628,14 +616,13 @@ def upload_dataset( ) if isinstance(dataset, DatasetV1): - self._upload_v1_dataset(dataset, ArtifactSubtype.STANDARD, timeout, access, owner, if_exists) + self._upload_v1_dataset(dataset, timeout, access, owner, if_exists) elif isinstance(dataset, DatasetV2): self._upload_v2_dataset(dataset, timeout, access, owner, if_exists) def _upload_v1_dataset( self, dataset: DatasetV1, - artifact_type: ArtifactSubtype, timeout: TimeoutTypes, access: AccessType, owner: HubOwner | str | None, @@ -673,11 +660,7 @@ def _upload_v1_dataset( # Step 1: Upload meta-data # Instead of directly uploading the data, we announce to the hub that we intend to upload it. # We do so separately for the Zarr archive and Parquet file. - url = ( - f"/v1/dataset/{dataset.artifact_id}" - if artifact_type == ArtifactSubtype.STANDARD - else f"/v2/competition/dataset/{dataset.owner}/{dataset.name}" - ) + url = f"/v1/dataset/{dataset.artifact_id}" self._base_request_to_hub( url=url, method="PUT", @@ -717,12 +700,9 @@ def _upload_v1_dataset( dataset.zarr_root.store.store, if_exists=if_exists, log=logger.info ) - base_artifact_url = ( - "datasets" if artifact_type == ArtifactSubtype.STANDARD else "/competition/datasets" - ) progress_indicator.update_success_msg( - f"Your {artifact_type} dataset has been successfully uploaded to the Hub. " - f"View it here: {urljoin(self.settings.hub_url, f'{base_artifact_url}/{dataset.owner}/{dataset.name}')}" + f"Your dataset has been successfully uploaded to the Hub. " + f"View it here: {urljoin(self.settings.hub_url, f'datasets/{dataset.owner}/{dataset.name}')}" ) def _upload_v2_dataset( @@ -784,9 +764,9 @@ def _upload_v2_dataset( dataset.zarr_root.store.store, if_exists=if_exists, log=logger.info ) - benchmark_url = urljoin(self.settings.hub_url, response.headers.get("Content-Location")) + dataset_url = urljoin(self.settings.hub_url, response.headers.get("Content-Location")) progress_indicator.update_success_msg( - f"Your V2 dataset has been successfully uploaded to the Hub. " f"View it here: {benchmark_url}" + f"Your V2 dataset has been successfully uploaded to the Hub. 
View it here: {dataset_url}" ) def upload_benchmark( @@ -818,18 +798,17 @@ def upload_benchmark( """ match benchmark: case BenchmarkV1Specification(): - self._upload_v1_benchmark(benchmark, ArtifactSubtype.STANDARD, access, owner) + self._upload_v1_benchmark(benchmark, access, owner) case BenchmarkV2Specification(): self._upload_v2_benchmark(benchmark, access, owner) def _upload_v1_benchmark( self, benchmark: BenchmarkV1Specification, - artifact_type: ArtifactSubtype, access: AccessType = "private", owner: HubOwner | str | None = None, ): - """Upload a standard or competition benchmark to the Polaris Hub. + """Upload a benchmark to the Polaris Hub. Info: Owner You have to manually specify the owner in the benchmark data model. Because the owner could @@ -847,7 +826,6 @@ def _upload_v1_benchmark( Args: benchmark: The benchmark to upload. - artifact_type: indicates whether the artifact is of the standard or competition type. access: Grant public or private access to result owner: Which Hub user or organization owns the artifact. Takes precedence over `benchmark.owner`. """ @@ -863,12 +841,12 @@ def _upload_v1_benchmark( benchmark_json["datasetArtifactId"] = benchmark.dataset.artifact_id benchmark_json["access"] = access - path_params = "/v1/benchmark" if artifact_type == ArtifactSubtype.STANDARD else "/v2/competition" + path_params = "/v1/benchmark" url = f"{path_params}/{benchmark.owner}/{benchmark.name}" self._base_request_to_hub(url=url, method="PUT", json=benchmark_json) progress_indicator.update_success_msg( - f"Your {artifact_type} benchmark has been successfully uploaded to the Hub. " + f"Your benchmark has been successfully uploaded to the Hub. " f"View it here: {urljoin(self.settings.hub_url, url)}" ) @@ -913,101 +891,60 @@ def _upload_v2_benchmark( f"Your benchmark has been successfully uploaded to the Hub. View it here: {benchmark_url}" ) - def get_competition( - self, - owner: str | HubOwner, - name: str, - verify_checksum: ChecksumStrategy = "verify_unless_zarr", - ) -> CompetitionSpecification: + def get_competition(self, artifact_id: str) -> CompetitionSpecification: """Load a competition from the Polaris Hub. Args: - owner: The owner of the competition. Can be either a user or organization from the Polaris Hub. - name: The name of the competition. - verify_checksum: Whether to use the checksum to verify the integrity of the dataset. + artifact_id: The artifact identifier for the competition Returns: A `CompetitionSpecification` instance, if it exists. """ - response = self._base_request_to_hub(url=f"/v2/competition/{owner}/{name}", method="GET") + url = f"/v1/competition/{artifact_id}" + response = self._base_request_to_hub(url=url, method="GET") response_data = response.json() - # TODO (jstlaurent): response["dataset"]["artifactId"] is the owner/name unique identifier, - # but we'd need to change the signature of get_dataset to use it - response_data["dataset"] = self._get_v1_dataset( - response_data["dataset"]["owner"]["slug"], - response_data["dataset"]["name"], - ArtifactSubtype.COMPETITION, - verify_checksum=verify_checksum, - ) - - if not verify_checksum: - response_data.pop("md5Sum", None) - - return CompetitionSpecification.model_construct(**response_data) - - def list_competitions(self, limit: int = 100, offset: int = 0) -> list[str]: - """List all available competitions on the Polaris Hub. - - Args: - limit: The maximum number of competitions to return. - offset: The offset from which to start returning competitions. 
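# Illustrative sketch (not part of this patch): with the competition-specific
# artifact subtype removed, the same public upload entry points simply dispatch
# on the artifact version. `dataset` and `benchmark` are placeholders for
# existing artifacts, and the keyword usage is an assumption based on the
# parameters forwarded to the private helpers.
from polaris.hub.client import PolarisHubClient

with PolarisHubClient() as client:
    # DatasetV1 goes through _upload_v1_dataset, DatasetV2 through _upload_v2_dataset
    client.upload_dataset(dataset, access="private", owner="my-org")

    # BenchmarkV1Specification -> /v1/benchmark, BenchmarkV2Specification -> V2 flow
    client.upload_benchmark(benchmark, access="private", owner="my-org")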
+ with StorageSession( + self, "read", CompetitionSpecification.urn_for(*artifact_id.split("/")) + ) as storage: + zarr_root_path = str(storage.paths.root) - Returns: - A list of competition names in the format `owner/competition_name`. - """ - with ProgressIndicator( - start_msg="Fetching artifacts...", - success_msg="Fetched artifacts.", - error_msg="Failed to fetch artifacts.", - ): - # TODO (cwognum): What to do with pagination, i.e. limit and offset? - response = self._base_request_to_hub( - url="/v2/competition", method="GET", params={"limit": limit, "offset": offset} - ) - response_data = response.json() - competitions_list = [f"{HubOwner(**bm['owner'])}/{bm['name']}" for bm in response_data["data"]] - return competitions_list + return CompetitionSpecification(zarr_root_path=zarr_root_path, **response_data) - def evaluate_competition( + def submit_competition_predictions( self, competition: CompetitionSpecification, competition_predictions: CompetitionPredictions, - ) -> CompetitionResults: - """Evaluate the predictions for a competition on the Polaris Hub. Target labels are fetched - by Polaris Hub and used only internally. + ): + """Submit predictions for a competition to the Polaris Hub. The Hub will evaluate them against + the secure test set and store the result. Args: competition: The competition to evaluate the predictions for. - competition_predictions: The predictions and associated metadata to be submitted for evaluation by the Hub. - - Returns: - A `CompetitionResults` object. + competition_predictions: The predictions and associated metadata to be submitted to the Hub. """ with ProgressIndicator( - start_msg="Evaluating competition predictions...", - success_msg="Evaluated competition predictions.", - error_msg="Failed to evaluate competition predictions.", + start_msg="Submitting competition predictions...", + success_msg="Submitted competition predictions.", + error_msg="Failed to submit competition predictions.", ) as progress_indicator: - competition.owner = HubOwner.normalize(competition.owner) - + # + # Prepare prediction payload for submission + prediction_json = competition_predictions.model_dump(by_alias=True, exclude_none=True) + prediction_payload = { + "competitionArtifactId": f"{competition.artifact_id}", + **prediction_json, + } + + # Submit payload to Hub response = self._base_request_to_hub( - url=f"/v2/competition/{competition.owner}/{competition.name}/evaluate", + url="/v1/competition-prediction", method="POST", - json=competition_predictions.model_dump(), + json=prediction_payload, ) - response_data = response.json() - # Inform the user about where to find their newly created artifact. - result_url = urljoin( - self.settings.hub_url, - f"/v2/competition/{competition.owner}/{competition.name}/{response_data['id']}", - ) + # Log success and return submission response progress_indicator.update_success_msg( - f"Your competition result has been successfully uploaded to the Hub. View it here: {result_url}" - ) - - scores = response_data["results"] - return CompetitionResults( - results=scores, competition_name=competition.name, competition_owner=competition.owner + "Your competition predictions have been successfully uploaded to the Hub for evaluation." 
) + return response diff --git a/polaris/hub/storage.py b/polaris/hub/storage.py index b751084d..325c9534 100644 --- a/polaris/hub/storage.py +++ b/polaris/hub/storage.py @@ -175,7 +175,7 @@ def copy_to_destination( total_skipped += skipped log( - f"Copied {total_copied} ({total_bytes_copied / (1024 ** 2):.2f} MiB), skipped {total_skipped}, of {number_source_keys} keys. {(total_copied + total_skipped) / number_source_keys * 100:.2f}% completed." + f"Copied {total_copied} ({total_bytes_copied / (1024**2):.2f} MiB), skipped {total_skipped}, of {number_source_keys} keys. {(total_copied + total_skipped) / number_source_keys * 100:.2f}% completed." ) return total_copied, total_skipped, total_bytes_copied diff --git a/polaris/loader/__init__.py b/polaris/loader/__init__.py index 980f6dcf..835fe561 100644 --- a/polaris/loader/__init__.py +++ b/polaris/loader/__init__.py @@ -1,3 +1,3 @@ -from .load import load_benchmark, load_dataset +from .load import load_benchmark, load_dataset, load_competition -_all__ = ["load_benchmark", "load_dataset"] +_all__ = ["load_benchmark", "load_dataset", "load_competition"] diff --git a/polaris/loader/load.py b/polaris/loader/load.py index 48168761..306911c4 100644 --- a/polaris/loader/load.py +++ b/polaris/loader/load.py @@ -3,10 +3,7 @@ import fsspec from datamol.utils import fs -from polaris.benchmark._definitions import ( - MultiTaskBenchmarkSpecification, - SingleTaskBenchmarkSpecification, -) +from polaris.benchmark import MultiTaskBenchmarkSpecification, SingleTaskBenchmarkSpecification from polaris.dataset import DatasetV1, create_dataset_from_file from polaris.experimental._benchmark_v2 import BenchmarkV2Specification from polaris.hub.client import PolarisHubClient @@ -101,3 +98,18 @@ def load_benchmark(path: str, verify_checksum: ChecksumStrategy = "verify_unless benchmark.verify_checksum() return benchmark + + +def load_competition(artifact_id: str): + """ + Loads a Polaris competition. + + On Polaris, a competition represents a secure and fair benchmark. The target labels never exist + on the client and all results are evaluated through Polaris' servers. + + Note: Dataset is automatically loaded + The dataset underlying the competition is automatically loaded when loading the competition. + + """ + with PolarisHubClient() as client: + return client.get_competition(artifact_id) diff --git a/polaris/utils/types.py b/polaris/utils/types.py index eba9f5d2..b53761c6 100644 --- a/polaris/utils/types.py +++ b/polaris/utils/types.py @@ -160,6 +160,9 @@ The type of predictions expected by the metric interface. 
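# Illustrative end-to-end sketch (not part of this patch) of the new competition
# flow: load_competition() fetches the CompetitionSpecification (and its
# Zarr-backed data) by artifact id, and predictions are pushed to the Hub for
# server-side evaluation. The artifact id, the owner slug, and the
# CompetitionPredictions fields other than name/owner/report_url are
# placeholders or assumptions, not confirmed by this diff.
import polaris as po
from polaris.evaluate import CompetitionPredictions
from polaris.hub.client import PolarisHubClient

competition = po.load_competition("some-org/some-competition")
train, test = competition.get_train_test_split()

y_pred = ...  # placeholder: model predictions for the test set

predictions = CompetitionPredictions(
    name="my-first-predictions",
    owner="my-username",  # assumption: a user/org slug accepted by the HubOwner field
    report_url="https://example.com/my-write-up",
    predictions=y_pred,  # assumption: payload field inherited from the predictions base class
)

with PolarisHubClient() as client:
    client.submit_competition_predictions(competition, predictions)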
""" +ColumnName: TypeAlias = str +"""A column name in a dataset.""" + class HubOwner(BaseModel): """An owner of an artifact on the Polaris Hub @@ -199,10 +202,3 @@ class TaskType(Enum): MULTI_TASK = "multi_task" SINGLE_TASK = "single_task" - - -class ArtifactSubtype(Enum): - """The major artifact types which Polaris supports""" - - STANDARD = "standard" - COMPETITION = "competition" diff --git a/pyproject.toml b/pyproject.toml index d07b49a4..cf45271e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ dependencies = [ "seaborn", "datamol >=0.12.1", "fastpdb", - "zarr", + "zarr >=2,<3", "pyarrow < 18", "fsspec[http]", "yaspin", diff --git a/tests/conftest.py b/tests/conftest.py index 407e936d..53d16284 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,9 +12,8 @@ SingleTaskBenchmarkSpecification, ) from polaris.competition import CompetitionSpecification -from polaris.dataset import ColumnAnnotation, CompetitionDataset, DatasetFactory, DatasetV1 +from polaris.dataset import ColumnAnnotation, DatasetFactory, DatasetV1, DatasetV2 from polaris.dataset.converters import SDFConverter -from polaris.experimental._dataset_v2 import DatasetV2 from polaris.utils.types import HubOwner @@ -143,24 +142,6 @@ def test_dataset_v2(zarr_archive, test_org_owner) -> DatasetV2: return dataset -@pytest.fixture(scope="function") -def test_competition_dataset(test_data, test_org_owner): - dataset = CompetitionDataset( - table=test_data, - name="test-competition-dataset", - source="https://www.example.com", - annotations={"expt": ColumnAnnotation(user_attributes={"unit": "kcal/mol"})}, - tags=["tagA", "tagB"], - user_attributes={"attributeA": "valueA", "attributeB": "valueB"}, - owner=test_org_owner, - license="CC-BY-4.0", - curation_reference="https://www.example.com", - ) - - check_version(dataset) - return dataset - - @pytest.fixture(scope="function") def zarr_archive(tmp_path): tmp_path = fs.join(tmp_path, "data.zarr") @@ -337,18 +318,32 @@ def test_multi_task_benchmark_clf(test_dataset, classification_metrics): @pytest.fixture(scope="function") -def test_competition(test_competition_dataset, test_org_owner, regression_metrics): +def test_competition(zarr_archive, test_org_owner, regression_metrics, test_dataset_v2): train_indices = list(range(90)) test_indices = list(range(90, 100)) competition = CompetitionSpecification( + # Base attributes name="test-competition", - dataset=test_competition_dataset, owner=test_org_owner, + tags=["tagA", "tagB"], + user_attributes={"attributeA": "valueA", "attributeB": "valueB"}, + # Benchmark attributes metrics=regression_metrics, main_metric="mean_absolute_error", split=(train_indices, test_indices), - target_cols="expt", - input_cols="smiles", + input_cols=["A"], + target_cols=["B"], + readme="Testing specification", + # Dataset attributes + source="https://www.example.com", + annotations={"A": ColumnAnnotation(user_attributes={"unit": "kcal/mol"})}, + license="CC-BY-4.0", + curation_reference="https://www.example.com", + zarr_root_path=zarr_archive, + # Competition attributes + start_time="2025-01-13T21:59:38Z", + end_time="2025-01-20T21:59:38Z", + n_classes={"B": 0}, ) check_version(competition) return competition diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index c343ad84..34f6ec18 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -224,7 +224,7 @@ def test_checksum_verification(test_single_task_benchmark): def test_benchmark_duplicate_metrics(test_single_task_benchmark): - """Test whether setting an 
invalid checksum raises an error.""" + """Tests that passing duplicate metrics will raise a validation error""" m = test_single_task_benchmark.model_dump() with pytest.raises(ValidationError, match="The benchmark specifies duplicate metric"): @@ -244,7 +244,7 @@ def test_benchmark_duplicate_metrics(test_single_task_benchmark): def test_benchmark_metric_deserialization(test_single_task_benchmark): - """Test whether setting an invalid checksum raises an error.""" + """Tests that passing metrics as a list of strings or dictionaries works as expected""" m = test_single_task_benchmark.model_dump() # Should work with strings diff --git a/tests/test_benchmark_v2.py b/tests/test_benchmark_v2.py index 81d0c664..732c4b67 100644 --- a/tests/test_benchmark_v2.py +++ b/tests/test_benchmark_v2.py @@ -2,7 +2,8 @@ from pydantic import ValidationError from pyroaring import BitMap -from polaris.experimental._benchmark_v2 import BenchmarkV2Specification, IndexSet, SplitV2 +from polaris.experimental._benchmark_v2 import BenchmarkV2Specification +from polaris.experimental._split_v2 import IndexSet, SplitV2 @pytest.fixture diff --git a/tests/test_competition.py b/tests/test_competition.py new file mode 100644 index 00000000..3315a0b4 --- /dev/null +++ b/tests/test_competition.py @@ -0,0 +1,171 @@ +from itertools import chain + +import pytest +from pydantic import ValidationError + +from polaris.competition import CompetitionSpecification +from polaris.evaluate import Metric +from polaris.utils.types import TaskType + + +def test_competition_split_verification(test_competition): + """Verifies that the split validation works as expected.""" + + obj = test_competition + cls = CompetitionSpecification + + # By using the fixture as a default, we know it doesn't always fail + default_kwargs = { + "target_cols": obj.target_cols, + "input_cols": obj.input_cols, + "metrics": obj.metrics, + "main_metric": obj.main_metric, + "name": obj.name, + "zarr_root_path": obj.zarr_root_path, + "readme": obj.readme, + "start_time": obj.start_time, + "end_time": obj.end_time, + "n_test_sets": obj.n_test_sets, + "n_test_datapoints": obj.n_test_datapoints, + "n_classes": obj.n_classes, + } + + train_split = obj.split[0] + test_split = obj.split[1] + + # One or more empty test partitions + with pytest.raises(ValidationError): + cls(split=(train_split,), **default_kwargs) + with pytest.raises(ValidationError): + cls(split=(train_split, []), **default_kwargs) + with pytest.raises(ValidationError): + cls(split=(train_split, {"test": []}), **default_kwargs) + # Non-exclusive partitions + with pytest.raises(ValidationError): + cls(split=(train_split, test_split["test"] + train_split[:1]), **default_kwargs) + with pytest.raises(ValidationError): + cls(split=(train_split, {"test1": test_split, "test2": train_split[:1]}), **default_kwargs) + # Invalid indices + with pytest.raises(ValidationError): + cls(split=(train_split + [len(obj)], test_split), **default_kwargs) + with pytest.raises(ValidationError): + cls(split=(train_split + [-1], test_split), **default_kwargs) + with pytest.raises(ValidationError): + cls(split=(train_split, test_split["test"] + [len(obj)]), **default_kwargs) + with pytest.raises(ValidationError): + cls(split=(train_split, test_split["test"] + [-1]), **default_kwargs) + # Duplicate indices + with pytest.raises(ValidationError): + cls(split=(train_split + train_split[:1], test_split), **default_kwargs) + with pytest.raises(ValidationError): + cls(split=(train_split, test_split["test"] + test_split["test"][:1]), 
**default_kwargs) + with pytest.raises(ValidationError): + cls( + split=(train_split, {"test1": test_split, "test2": test_split["test"] + test_split["test"][:1]}), + **default_kwargs, + ) + + # It should _not_ fail with duplicate indices across test partitions + cls(split=(train_split, {"test1": test_split["test"], "test2": test_split["test"]}), **default_kwargs) + # It should _not_ fail with missing indices + cls(split=(train_split[:-1], test_split), **default_kwargs) + # It should _not_ fail with an empty train set + competition = cls(split=([], test_split), **default_kwargs) + train, _ = competition.get_train_test_split() + assert len(train) == 0 + + +@pytest.mark.parametrize("cls", [CompetitionSpecification]) +def test_competition_metrics_verification(test_competition, cls): + """Verifies that the metric validation works as expected.""" + # By using the fixture as a default, we know it doesn't always fail + base = test_competition + + default_kwargs = { + "target_cols": base.target_cols, + "input_cols": base.input_cols, + "main_metric": base.main_metric, + "name": base.name, + "zarr_root_path": base.zarr_root_path, + "readme": base.readme, + "start_time": base.start_time, + "end_time": base.end_time, + "n_test_sets": base.n_test_sets, + "n_test_datapoints": base.n_test_datapoints, + "n_classes": base.n_classes, + } + + # Invalid metric + with pytest.raises(ValidationError): + cls(metrics=["invalid"], **default_kwargs) + with pytest.raises(ValidationError): + cls(metrics="invalid", **default_kwargs) + with pytest.raises(ValidationError): + metrics_list = list(base.metrics) + cls( + metrics=metrics_list + [metrics_list[0]], + **default_kwargs, + ) + + +def test_competition_duplicate_metrics(test_competition): + """Tests that passing duplicate metrics will raise a validation error""" + m = test_competition.model_dump() + + with pytest.raises(ValidationError, match="The benchmark specifies duplicate metric"): + m["metrics"] = [ + Metric(label="roc_auc", config={"group_by": "CLASS_expt"}), + Metric(label="roc_auc", config={"group_by": "CLASS_expt"}), + ] + m["main_metric"] = m["metrics"][0] + CompetitionSpecification(**m) + + with pytest.raises(ValidationError, match="The metrics of a benchmark need to have unique names."): + m["metrics"][0].config.group_by = "MULTICLASS_calc" + CompetitionSpecification(**m) + + m["metrics"][0].custom_name = "custom_name" + CompetitionSpecification(**m) + + +def test_competition_metric_deserialization(test_competition): + """Tests that passing metrics as a list of strings or dictionaries works as expected""" + m = test_competition.model_dump() + + # Should work with strings + m["metrics"] = ["mean_absolute_error", "accuracy"] + m["main_metric"] = "accuracy" + CompetitionSpecification(**m) + + # Should work with dictionaries + m["metrics"] = [ + {"label": "mean_absolute_error", "config": {"group_by": "CLASS_expt"}}, + {"label": "accuracy"}, + ] + CompetitionSpecification(**m) + + +def test_competition_train_test_split(test_competition): + """Tests that the competition's train/test split can be retrieved through a CompetitionSpecification instance""" + + train, test = test_competition.get_train_test_split() + + train_split = test_competition.split[0] + test_sets = test_competition.split[1] + test_split = set(chain.from_iterable(test_sets.values())) + + assert len(train) == len(train_split) + assert len(test) == len(test_split) + + +def test_competition_computed_fields(test_competition): + default_test_set_name = "test" + assert test_competition.task_type == 
TaskType.SINGLE_TASK.value + assert test_competition.test_set_labels == [default_test_set_name] + assert test_competition.test_set_sizes == {default_test_set_name: 10} + + +def test_competition_interface(test_competition): + """Tests that the CompetitionSpecification class doesn't accidentally inherit the evaluate method from the benchmark class""" + with pytest.raises(AttributeError): + test_competition.evaluate() diff --git a/tests/test_dataset_v2.py b/tests/test_dataset_v2.py index a9c18137..c664ded0 100644 --- a/tests/test_dataset_v2.py +++ b/tests/test_dataset_v2.py @@ -9,11 +9,11 @@ import zarr from pydantic import ValidationError -from polaris.dataset import Subset +from polaris.dataset import DatasetV2, Subset +from polaris.dataset._dataset_v2 import _INDEX_ARRAY_KEY from polaris.dataset._factory import DatasetFactory from polaris.dataset.converters._pdb import PDBConverter from polaris.dataset.zarr._manifest import generate_zarr_manifest -from polaris.experimental._dataset_v2 import _INDEX_ARRAY_KEY, DatasetV2 def test_dataset_v2_get_columns(test_dataset_v2):
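# Migration note, shown as an illustrative sketch (not part of this patch):
# DatasetV2 and the Zarr index-array key have been promoted out of
# polaris.experimental, so downstream imports change as in the test above.

# Before:
# from polaris.experimental._dataset_v2 import _INDEX_ARRAY_KEY, DatasetV2

# After:
from polaris.dataset import DatasetV2
from polaris.dataset._dataset_v2 import _INDEX_ARRAY_KEY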