diff --git a/experiments_engine/data.py b/experiments_engine/data.py
index a775bed..b8605bc 100644
--- a/experiments_engine/data.py
+++ b/experiments_engine/data.py
@@ -1,3 +1,5 @@
+from typing import Any
+
 import numpy as np
 import pandas as pd
 from openml import OpenMLTask
@@ -14,7 +16,7 @@ def move_target_to_last_column(
     return df
 
 
-def is_eligible_task(task: OpenMLTask) -> bool:
+def is_eligible_task(task: OpenMLTask) -> Any:
     if task is None:
         return False
     if (
diff --git a/experiments_engine/data_utils.py b/experiments_engine/data_utils.py
index 03d9eda..adf667d 100644
--- a/experiments_engine/data_utils.py
+++ b/experiments_engine/data_utils.py
@@ -1,6 +1,6 @@
 import json
 from pathlib import Path
-from typing import Tuple
+from typing import Any, Tuple
 
 import numpy as np
 import pandas as pd
@@ -18,7 +18,7 @@ def get_dataset_from_path(path: Path) -> Tuple[Tensor, Tensor]:
     dataset = pd.read_parquet(path / "test.parquet")
     X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]
     pipeline = DataUtils.get_preprocessing_pipeline()
-    X = pipeline.fit_transform(X)  # type: ignore
+    X = pipeline.fit_transform(X)
     X = Tensor(X.values).cuda()
     y = Tensor(y.values).reshape(-1, 1).cuda()
     return X, y
@@ -26,7 +26,7 @@ def get_dataset_from_path(path: Path) -> Tuple[Tensor, Tensor]:
 
 def load_datasets_with_landmarkers(
     reduce_landmarkers_dimensionality: bool = False,
-):
+) -> Tuple[dict[str, Any], dict[str, Any], dict[str, Any], dict[str, Any]]:
     train_datasets_names = list(
         sorted(
             [
@@ -91,7 +91,7 @@ def load_datasets_with_landmarkers(
 
 def __project_landmarkers_to_smaller_space(
     train_landmarkers: dict[str, Tensor], val_landmarkers: dict[str, Tensor]
-):
+) -> Tuple[dict[str, Tensor], dict[str, Tensor]]:
     projection_train_data = np.stack(
         list([item.cpu().numpy() for item in train_landmarkers.values()])
     )
@@ -100,7 +100,7 @@ def load_datasets_with_landmarkers(
     scaling = StandardScaler().fit(scaling_train_data)
     out_train_landmarkers = {
         name: Tensor(
-            scaling.transform(  # type: ignore
+            scaling.transform(
                 projection.transform(value.cpu().numpy().reshape(1, -1))
             )[0]
         )
@@ -108,7 +108,7 @@
     }
     out_val_landmarkers = {
         name: Tensor(
-            scaling.transform(  # type: ignore
+            scaling.transform(
                 projection.transform(value.cpu().numpy().reshape(1, -1))
             )[0]
         )
diff --git a/experiments_engine/hpo.py b/experiments_engine/hpo.py
index f3adf0b..11827ae 100644
--- a/experiments_engine/hpo.py
+++ b/experiments_engine/hpo.py
@@ -4,6 +4,7 @@
 import optuna
 import pandas as pd
 from dataset2vec.utils import DataUtils
+from numpy.typing import NDArray
 from sklearn.base import BaseEstimator
 from sklearn.metrics import roc_auc_score
 from xgboost import XGBClassifier
@@ -15,7 +16,7 @@ def __init__(
         self,
         df_train: pd.DataFrame,
         df_test: pd.DataFrame,
-        metric: Callable = roc_auc_score,
+        metric: Callable[[NDArray[Any], NDArray[Any]], float] = roc_auc_score,
     ):
         self.X_train, self.y_train = (
             df_train.iloc[:, :-1],
@@ -23,16 +24,16 @@ def __init__(
         )
         self.X_test, self.y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]
         self.pipeline = DataUtils.get_preprocessing_pipeline()
-        self.X_train = self.pipeline.fit_transform(self.X_train)  # type: ignore # noqa
-        self.X_test = self.pipeline.transform(self.X_test)  # type: ignore
+        self.X_train = self.pipeline.fit_transform(self.X_train)
+        self.X_test = self.pipeline.transform(self.X_test)
         self.metric = metric
 
     def __call__(self, trial: optuna.Trial) -> float:
         X_train, y_train, X_test, y_test = self.get_data()
         model = self.get_model(trial)
-        model.fit(X_train, y_train)  # type: ignore
-        probas = model.predict_proba(X_test)[:, 1]  # type: ignore
-        return self.metric(y_test, probas)  # type: ignore
+        model.fit(X_train, y_train)
+        probas = model.predict_proba(X_test)[:, 1]
+        return self.metric(y_test, probas)
 
     @abstractmethod
     def get_model(self, trial: optuna.Trial) -> BaseEstimator:
@@ -83,7 +84,7 @@ def perform_study(
     return study
 
 
-def get_best_study_params(study: optuna.Study) -> dict:
+def get_best_study_params(study: optuna.Study) -> dict[str, Any]:
     return study.best_params
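For orientation, the objective above is a plain callable with an abstract `get_model` hook, so it plugs straight into Optuna. A minimal sketch of driving it end to end — the concrete subclass name `XgboostObjective` and its import are assumptions, not taken from this diff; only the constructor shape (train/test frames with the target in the last column) and the `__call__` contract come from the hunks above:

```python
import numpy as np
import optuna
import pandas as pd

from experiments_engine.hpo import XgboostObjective  # name assumed

# Toy binary-classification frame; the objective expects the target last.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 3)), columns=["f1", "f2", "f3"])
df["target"] = (df["f1"] > 0).astype(int)
df_train, df_test = df.iloc[:150], df.iloc[150:]

# Each trial fits the suggested model and returns the test-set metric
# (ROC AUC by default), which Optuna maximizes.
study = optuna.create_study(direction="maximize")
study.optimize(XgboostObjective(df_train, df_test), n_trials=20)
print(study.best_params)
```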
diff --git a/experiments_engine/portfolio_selection.py b/experiments_engine/portfolio_selection.py
index 0101817..9a603b9 100644
--- a/experiments_engine/portfolio_selection.py
+++ b/experiments_engine/portfolio_selection.py
@@ -1,20 +1,22 @@
+from typing import Any
+
 import numpy as np
 from numpy.typing import NDArray
 from scipy.stats import rankdata
 
 
 def extract_best_configuration_idx_from_cluster_eval_results(
-    datasets_inside_clusters_performances: NDArray,
+    datasets_inside_clusters_performances: NDArray[Any],
 ) -> int:
     return get_ranks_of_hp_configurations(
         datasets_inside_clusters_performances
     )[0]
 
 
-def get_ranks_of_hp_configurations(hp_performances: NDArray) -> list[int]:
+def get_ranks_of_hp_configurations(hp_performances: NDArray[Any]) -> list[int]:
     ranks_per_dataset = np.array(
         [rankdata(-row, method="dense") for row in hp_performances]
     )
     average_ranks_per_configuration = ranks_per_dataset.mean(axis=0)
     final_ranks = np.argsort(average_ranks_per_configuration)
-    return final_ranks.tolist()
+    return list(final_ranks)
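`get_ranks_of_hp_configurations` is easiest to follow on a small matrix. The sketch below just replays the function's steps and agrees with the new `test/wsmf/selectors/test_utils.py` test further down in this diff:

```python
import numpy as np
from scipy.stats import rankdata

# Rows are datasets, columns are HP configurations (higher score = better).
hp_performances = np.array(
    [
        [0.5, 0.6, 0.5, 0.7],
        [0.5, 0.6, 0.7, 0.6],
        [0.5, 0.7, 0.5, 0.7],
        [0.4, 0.6, 0.5, 0.4],
    ]
)
# Dense ranks per dataset; negating makes rank 1 the best configuration.
ranks = np.array([rankdata(-row, method="dense") for row in hp_performances])
print(ranks.mean(axis=0))              # [2.75 1.5  2.   1.75]
print(np.argsort(ranks.mean(axis=0)))  # [1 3 2 0] -> config 1 wins overall
```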
diff --git a/experiments_engine/utils.py b/experiments_engine/utils.py
index 9476ebb..617e61e 100644
--- a/experiments_engine/utils.py
+++ b/experiments_engine/utils.py
@@ -1,12 +1,13 @@
 import json
 from pathlib import Path
+from typing import Any
 
 import torch
 
-device = "cuda" if torch.cuda.is_available else "cpu"
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
-def read_json(path: Path) -> dict:
+def read_json(path: Path) -> Any:
     with open(path, "r") as f:
         return json.load(f)
diff --git a/experiments_engine/warmstart_utils.py b/experiments_engine/warmstart_utils.py
index a15cce0..f27f72d 100644
--- a/experiments_engine/warmstart_utils.py
+++ b/experiments_engine/warmstart_utils.py
@@ -37,7 +37,7 @@ def perform_ground_truth_warm_start_experiment(
     )
     if n_initial_trials > 0:
         initial_trials = warm_starter.propose_configurations(
-            objective_landmarkers,  # type: ignore
+            objective_landmarkers,
             n_initial_trials,
         )
         for trial in initial_trials:
diff --git a/requirements_dev.txt b/requirements_dev.txt
new file mode 100644
index 0000000..7349580
--- /dev/null
+++ b/requirements_dev.txt
@@ -0,0 +1,15 @@
+dataset2vec==1.0.0
+openml==0.14.2
+loguru==0.7.2
+optuna==3.6.1
+xgboost==2.0.3
+scikit-learn==1.4.2
+pytest==8.2.1
+pymfe==0.4.3
+seaborn==0.13.2
+tensorboard==2.17.0
+numpy==1.26.4
+isort==5.13.2
+black==24.8.0
+flake8==7.1.0
+mypy==1.11.1
\ No newline at end of file
diff --git a/scripts/check_code.sh b/scripts/check_code.sh
new file mode 100755
index 0000000..3373a30
--- /dev/null
+++ b/scripts/check_code.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -e
+
+export PATHS_TO_CHECK="wsmf experiments_engine test"
+
+echo "Running isort"
+isort --profile=black --line-length=79 $PATHS_TO_CHECK
+
+echo "Running black"
+black --line-length=79 $PATHS_TO_CHECK
+
+echo "Running flake8"
+flake8 --ignore=W605,W503 --exclude experiments_engine/cd_plot.py $PATHS_TO_CHECK
+
+echo "Running mypy"
+mypy \
+    --install-types \
+    --non-interactive \
+    --ignore-missing-imports \
+    --strict \
+    --namespace-packages \
+    --exclude experiments_engine/cd_plot.py \
+    $PATHS_TO_CHECK
diff --git a/run_all.sh b/scripts/run_all_experiments.sh
similarity index 100%
rename from run_all.sh
rename to scripts/run_all_experiments.sh
diff --git a/test/experiments_engine/test_data.py b/test/experiments_engine/test_data.py
index 5ef2c77..c5bd945 100644
--- a/test/experiments_engine/test_data.py
+++ b/test/experiments_engine/test_data.py
@@ -1,4 +1,4 @@
-from unittest.mock import patch
+from unittest.mock import Mock, patch
 
 import numpy as np
 import pandas as pd
@@ -10,7 +10,7 @@
 )
 
 
-def test_move_target_to_last_column_when_target_last():
+def test_move_target_to_last_column_when_target_last() -> None:
     # Given
     df = pd.DataFrame({"col1": [1, 2], "col2": [1, 2]})
 
@@ -21,7 +21,7 @@ def test_move_target_to_last_column_when_target_last():
     assert (actual_df == df).all(axis=None)
 
 
-def test_move_target_to_last_column_when_target_not_last():
+def test_move_target_to_last_column_when_target_not_last() -> None:
     # Given
     df = pd.DataFrame({"col1": [1, 2], "col2": [1, 2]})
 
@@ -32,7 +32,7 @@ def test_move_target_to_last_column_when_target_not_last():
     assert (actual_df == df[["col2", "col1"]]).all(axis=None)
 
 
-def test_remove_unwanted_columns():
+def test_remove_unwanted_columns() -> None:
     # Given
     df = pd.DataFrame({"id_1": [1], "2id": [2], "3_id": [3], "col": [4]})
 
@@ -43,7 +43,7 @@ def test_remove_unwanted_columns():
     assert (actual_df == df[["2id", "col"]]).all(axis=None)
 
 
-def test_clean_and_binarize_classification_multiple_classes():
+def test_clean_and_binarize_classification_multiple_classes() -> None:
     # Given
     df = pd.DataFrame({"col": [1, 2, 3, 4], "target": [1, 2, 3, 1]})
 
@@ -56,7 +56,7 @@ def test_clean_and_binarize_classification_multiple_classes():
     assert actual_df["target"].max() == 1
 
 
-def test_clean_and_binarize_classification_text_classes():
+def test_clean_and_binarize_classification_text_classes() -> None:
     # Given
     df = pd.DataFrame({"col": [1, 2, 3, 4], "target": ["A", "B", "C", "A"]})
 
@@ -71,8 +71,8 @@ def test_clean_and_binarize_classification_text_classes():
 
 @patch("numpy.random.uniform")
 def test_clean_and_binarize_classification_when_subset_only_zeros(
-    uniform_mock,
-):
+    uniform_mock: Mock,
+) -> None:
     uniform_mock.return_value = np.array([0.1, 0.2, 0.3])
     # Given
     df = pd.DataFrame({"col": [1, 2, 3, 4], "target": ["A", "B", "C", "A"]})
@@ -88,8 +88,8 @@ def test_clean_and_binarize_classification_when_subset_only_zeros(
 
 @patch("numpy.random.uniform")
 def test_clean_and_binarize_classification_when_subset_only_ones(
-    uniform_mock,
-):
+    uniform_mock: Mock,
+) -> None:
     uniform_mock.return_value = np.array([0.6, 0.7, 0.8])
     # Given
     df = pd.DataFrame({"col": [1, 2, 3, 4], "target": ["A", "B", "C", "A"]})
diff --git a/test/experiments_engine/test_portfolio_choice.py b/test/experiments_engine/test_portfolio_choice.py
index 89a79e8..5da3962 100644
--- a/test/experiments_engine/test_portfolio_choice.py
+++ b/test/experiments_engine/test_portfolio_choice.py
@@ -5,7 +5,7 @@
 )
 
 
-def test_extract_best_configuration_idx_from_cluster_eval_results():
+def test_extract_best_configuration_idx_from_cluster_eval_results() -> None:
     # Given
     datasets_inside_clusters_performances = np.array(
         [
diff --git a/test/wsmf/metamodels/data/test_dataset.py b/test/wsmf/metamodels/data/test_dataset.py
index e6f7b6c..32d2c37 100644
--- a/test/wsmf/metamodels/data/test_dataset.py
+++ b/test/wsmf/metamodels/data/test_dataset.py
@@ -4,7 +4,7 @@
 from wsmf.metamodels.data import EncoderHpoDataset
 
 
-def test_d2v_hpo_dataset_has_proper_length():
+def test_d2v_hpo_dataset_has_proper_length() -> None:
     # Given
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
     dataset1_y = Tensor([[0], [1]])
@@ -26,7 +26,7 @@ def test_d2v_hpo_dataset_has_proper_length():
     assert len(d2v_hpo_dataset) == 2
 
 
-def test_d2v_hpo_dataset_has_proper_dataset_names():
+def test_d2v_hpo_dataset_has_proper_dataset_names() -> None:
     # Given
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
     dataset1_y = Tensor([[0], [1]])
@@ -48,7 +48,7 @@ def test_d2v_hpo_dataset_has_proper_dataset_names():
     assert d2v_hpo_dataset.dataset_names == ["dataset1", "dataset2"]
 
 
-def test_d2v_hpo_dataset_returns_proper_data_on_index():
+def test_d2v_hpo_dataset_returns_proper_data_on_index() -> None:
     # Given
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
     dataset1_y = Tensor([[0], [1]])
@@ -75,7 +75,7 @@ def test_d2v_hpo_dataset_returns_proper_data_on_index():
     assert (actual_landmarkers == Tensor([-1, -2, -3])).all()
 
 
-def test_d2v_hpo_dataset_fail_when_inconsistent_data_sizes():
+def test_d2v_hpo_dataset_fail_when_inconsistent_data_sizes() -> None:
     # Given
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
     dataset1_y = Tensor([[0], [1]])
diff --git a/test/wsmf/metamodels/data/test_landmarker_reconstruction.py b/test/wsmf/metamodels/data/test_landmarker_reconstruction.py
index 24772d4..32af9eb 100644
--- a/test/wsmf/metamodels/data/test_landmarker_reconstruction.py
+++ b/test/wsmf/metamodels/data/test_landmarker_reconstruction.py
@@ -11,7 +11,7 @@
 @patch("numpy.random.choice")
 def test_landmarker_reconstruction_loader_returns_proper_sample(
     choice_mock: Mock,
-):
+) -> None:
     # Given
     choice_mock.return_value = [0, 1]
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
@@ -31,10 +31,10 @@ def test_landmarker_reconstruction_loader_returns_proper_sample(
     # Then
     assert (sample[0] == dataset1_X).all()
     assert (sample[1] == dataset1_y).all()
-    assert (sample[2] == landmarkers["dataset1"]).all()
+    assert (sample[2] == landmarkers["dataset1"]).all()  # type: ignore
 
 
-def test_landmarker_reconstruction_loader_returns_proper_batch_size():
+def test_landmarker_reconstruction_loader_returns_proper_batch_size() -> None:
     # Given
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
     dataset1_y = Tensor([[0], [1]])
@@ -58,7 +58,7 @@ def test_landmarker_reconstruction_loader_returns_proper_batch_size():
     assert len(batch) == 2
 
 
-def test_landmarker_reconstruction_loader_returns_all_datasets():
+def test_landmarker_reconstruction_loader_returns_all_datasets() -> None:
     # Given
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
     dataset1_y = Tensor([[0], [1]])
@@ -81,7 +81,7 @@ def test_landmarker_reconstruction_loader_returns_all_datasets():
     # Then
     assert (returned_datasets[0][0][0] == dataset1_X).all()
     assert (returned_datasets[0][0][1] == dataset1_y).all()
-    assert (returned_datasets[0][0][2] == landmarkers["dataset1"]).all()
+    assert (returned_datasets[0][0][2] == landmarkers["dataset1"]).all()  # type: ignore # noqa E501
     assert (returned_datasets[1][0][0] == dataset2_X).all()
     assert (returned_datasets[1][0][1] == dataset2_y).all()
-    assert (returned_datasets[1][0][2] == landmarkers["dataset2"]).all()
+    assert (returned_datasets[1][0][2] == landmarkers["dataset2"]).all()  # type: ignore # noqa E501
diff --git a/test/wsmf/metamodels/data/test_metric_loader.py b/test/wsmf/metamodels/data/test_metric_loader.py
index 4c57f98..56be97f 100644
--- a/test/wsmf/metamodels/data/test_metric_loader.py
+++ b/test/wsmf/metamodels/data/test_metric_loader.py
@@ -7,7 +7,9 @@
 
 
 @patch("numpy.random.choice")
-def test_encoder_metric_loader_calculates_sample_properly(choice_mock: Mock):
+def test_encoder_metric_loader_calculates_sample_properly(
+    choice_mock: Mock,
+) -> None:
     # Given
     choice_mock.return_value = [0, 1]
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
@@ -40,7 +42,7 @@ def test_encoder_metric_loader_calculates_sample_properly(
     assert np.isclose(sample[4], 56 / 3)
 
 
-def test_encoder_metric_loader_returns_proper_number_of_batches():
+def test_encoder_metric_loader_returns_proper_number_of_batches() -> None:
     # Given
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
     dataset1_y = Tensor([[0], [1]])
@@ -64,7 +66,7 @@ def test_encoder_metric_loader_returns_proper_number_of_batches():
     assert len(batches) == 16
 
 
-def test_encoder_metric_loader_returns_batch_with_proper_size():
+def test_encoder_metric_loader_returns_batch_with_proper_size() -> None:
     # Given
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
     dataset1_y = Tensor([[0], [1]])
diff --git a/test/wsmf/metamodels/data/test_repeatable.py b/test/wsmf/metamodels/data/test_repeatable.py
index 955c4d2..d9b5623 100644
--- a/test/wsmf/metamodels/data/test_repeatable.py
+++ b/test/wsmf/metamodels/data/test_repeatable.py
@@ -7,7 +7,7 @@
 )
 
 
-def test_encoder_metric_loader_calculates_sample_properly():
+def test_encoder_metric_loader_calculates_sample_properly() -> None:
     # Given
     dataset1_X = Tensor([[1, 2, 3], [4, 5, 6]])
     dataset1_y = Tensor([[0], [1]])
diff --git a/test/wsmf/metamodels/networks/test_d2v_metric.py b/test/wsmf/metamodels/networks/test_d2v_metric.py
index 65c5e43..38b148e 100644
--- a/test/wsmf/metamodels/networks/test_d2v_metric.py
+++ b/test/wsmf/metamodels/networks/test_d2v_metric.py
@@ -6,7 +6,7 @@
 
 
 @patch("dataset2vec.Dataset2Vec.forward")
-def test_d2v_metric_forward(dataset2vec_mock: Mock):
+def test_d2v_metric_forward(dataset2vec_mock: Mock) -> None:
     # Given
     dataset2vec_mock.return_value = Tensor([1.0, 2.0, 3.0])
     meta_model = Dataset2VecMetricLearning()
diff --git a/test/wsmf/metamodels/networks/test_d2v_reconstruction.py b/test/wsmf/metamodels/networks/test_d2v_reconstruction.py
index 4304ca3..cfb2a69 100644
--- a/test/wsmf/metamodels/networks/test_d2v_reconstruction.py
+++ b/test/wsmf/metamodels/networks/test_d2v_reconstruction.py
@@ -5,7 +5,7 @@
 from wsmf.metamodels.networks import Dataset2VecForLandmarkerReconstruction
 
 
-def test_meta_model_returns_output_of_proper_dimensionality():
+def test_meta_model_returns_output_of_proper_dimensionality() -> None:
     # Given
     meta_model = Dataset2VecForLandmarkerReconstruction(4)
     X, y = rand((10, 5)), rand((10, 1))
@@ -17,13 +17,13 @@ def test_meta_model_returns_output_of_proper_dimensionality():
     assert reconstructed_landmarkers.shape == Size([4])
 
 
-def test_meta_model_uses_reconstructor():
+def test_meta_model_uses_reconstructor() -> None:
     # Given
     meta_model = Dataset2VecForLandmarkerReconstruction(3)
     encoder_mock = Mock(return_value=Tensor([1, 2, 3]))
     meta_model.dataset2vec.forward = encoder_mock
     reconstructor_mock = Mock(return_value=Tensor([4, 5, 6]))
-    meta_model.landmarker_reconstructor.forward = reconstructor_mock
+    meta_model.landmarker_reconstructor.forward = reconstructor_mock  # type: ignore # noqa: E501
     X, y = rand((10, 5)), rand((10, 1))
 
     # When
diff --git a/test/wsmf/metamodels/train/test_interface.py b/test/wsmf/metamodels/train/test_interface.py
index 4c0258d..4a5cd9a 100644
--- a/test/wsmf/metamodels/train/test_interface.py
+++ b/test/wsmf/metamodels/train/test_interface.py
@@ -9,7 +9,7 @@
 
 
 class MockImplementation(TrainingInterface):
-    def forward(self, *args, **kwargs) -> Tensor:
+    def forward(self, *args, **kwargs) -> Tensor:  # type: ignore
         return Tensor([0])
 
     def calculate_datasets_similarity(
@@ -17,13 +17,13 @@ def forward(self, *args, **kwargs) -> Tensor:
     ) -> Tensor:
         return Tensor([0.5])
 
-    def configure_optimizers(
+    def configure_optimizers(  # type: ignore
         self,
     ) -> tuple[list[Optimizer], list[dict[str, Any]]]:
         return None  # type: ignore
 
 
-def test_on_train_epoch_start():
+def test_on_train_epoch_start() -> None:
     # Given
     interface = MockImplementation(OptimizerConfig(), lambda x, y: Tensor([0]))
 
@@ -43,7 +43,7 @@
 def test_training_step(
     calculate_loss_mock: Mock,
     extract_labels_and_similarities_from_batch_mock: Mock,
-):
+) -> None:
     # Given
     extract_labels_and_similarities_from_batch_mock.return_value = (
         Tensor([1.0, 2.0, 3.0]),
@@ -66,7 +66,7 @@ def test_training_step(
     assert (actual_output["predictions"] == Tensor([4.0, 5.0, 6.0])).all()
 
 
-def test_on_train_batch_end():
+def test_on_train_batch_end() -> None:
     # Given
     interface = MockImplementation(OptimizerConfig(), lambda x, y: Tensor([0]))
     interface.on_train_epoch_start()
@@ -89,7 +89,7 @@
 
 
 @patch("wsmf.metamodels.train.interface.TrainingInterface.calculate_loss")
-def test_on_train_epoch_end(calculate_loss_mock: Mock):
+def test_on_train_epoch_end(calculate_loss_mock: Mock) -> None:
     # Given
     calculate_loss_mock.return_value = Tensor([0.0])
     interface = MockImplementation(OptimizerConfig(), lambda x, y: Tensor([0]))
@@ -106,7 +106,7 @@ def test_on_train_epoch_end(calculate_loss_mock: Mock):
     assert (calculate_loss_args[0][1] == Tensor([0.1, 0.2, 0.3, 0.4])).all()
 
 
-def test_on_validation_epoch_start():
+def test_on_validation_epoch_start() -> None:
     # Given
     interface = MockImplementation(OptimizerConfig(), lambda x, y: Tensor([0]))
 
@@ -126,7 +126,7 @@
 def test_validation_step(
     calculate_loss_mock: Mock,
     extract_labels_and_similarities_from_batch_mock: Mock,
-):
+) -> None:
     # Given
     extract_labels_and_similarities_from_batch_mock.return_value = (
         Tensor([1.0, 2.0, 3.0]),
@@ -149,7 +149,7 @@ def test_validation_step(
     assert (actual_output["predictions"] == Tensor([4.0, 5.0, 6.0])).all()
 
 
-def test_on_validation_batch_end():
+def test_on_validation_batch_end() -> None:
     # Given
     interface = MockImplementation(OptimizerConfig(), lambda x, y: Tensor([0]))
     interface.on_validation_epoch_start()
@@ -172,7 +172,7 @@
 
 
 @patch("wsmf.metamodels.train.interface.TrainingInterface.calculate_loss")
-def test_on_validation_epoch_end(calculate_loss_mock: Mock):
+def test_on_validation_epoch_end(calculate_loss_mock: Mock) -> None:
     # Given
     calculate_loss_mock.return_value = Tensor([0.0])
     interface = MockImplementation(OptimizerConfig(), lambda x, y: Tensor([0]))
@@ -189,7 +189,7 @@ def test_on_validation_epoch_end(calculate_loss_mock: Mock):
     assert (calculate_loss_args[0][1] == Tensor([0.1, 0.2, 0.3, 0.4])).all()
 
 
-def test_extract_labels_and_similarities_from_batch():
+def test_extract_labels_and_similarities_from_batch() -> None:
     # Given
     batch = [
         (rand(10, 5), rand(10, 1), rand(5, 3), rand(5, 1), 1),
diff --git a/test/wsmf/metamodels/train/test_metric.py b/test/wsmf/metamodels/train/test_metric.py
index 015a753..821c8d5 100644
--- a/test/wsmf/metamodels/train/test_metric.py
+++ b/test/wsmf/metamodels/train/test_metric.py
@@ -1,8 +1,8 @@
 from typing import Any, Callable
 
-import torch
 from dataset2vec.config import OptimizerConfig
 from torch import Tensor, rand
+from torch.optim.optimizer import Optimizer
 
 from wsmf.metamodels.train import MetricLearningTrainingInterface
 
@@ -23,13 +23,13 @@ def __init__(
     def forward(self, X: Tensor, y: Tensor) -> Tensor:
         return next(self.mock_encodings_generator)
 
-    def configure_optimizers(
+    def configure_optimizers(  # type: ignore
         self,
-    ) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]:
+    ) -> tuple[list[Optimizer], list[dict[str, Any]]]:
         return None  # type: ignore
 
 
-def test_calculate_loss():
+def test_calculate_loss() -> None:
     # Given
     implementation = MockImplementation(
         OptimizerConfig(), lambda x, y: Tensor([0])
diff --git a/test/wsmf/metamodels/train/test_reconstruction.py b/test/wsmf/metamodels/train/test_reconstruction.py
index 8237a8e..c8ff27c 100644
--- a/test/wsmf/metamodels/train/test_reconstruction.py
+++ b/test/wsmf/metamodels/train/test_reconstruction.py
@@ -1,8 +1,8 @@
 from typing import Any, Callable
 
-import torch
 from dataset2vec.config import OptimizerConfig
 from torch import Tensor, rand
+from torch.optim.optimizer import Optimizer
 
 from wsmf.metamodels.train import LandmarkerReconstructionTrainingInterface
 
@@ -23,13 +23,13 @@ def __init__(
     def forward(self, X: Tensor, y: Tensor) -> Tensor:
         return next(self.mock_encodings_generator)
 
-    def configure_optimizers(
+    def configure_optimizers(  # type: ignore
         self,
-    ) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]:
+    ) -> tuple[list[Optimizer], list[dict[str, Any]]]:
         return None  # type: ignore
 
 
-def test_extract_labels_and_similarities_from_batch():
+def test_extract_labels_and_similarities_from_batch() -> None:
     # Given
     batch = [
         (rand(10, 5), rand(10, 1), Tensor([4, 5, 6])),
diff --git a/test/wsmf/selectors/test_baselines.py b/test/wsmf/selectors/test_baselines.py
new file mode 100644
index 0000000..2bc956d
--- /dev/null
+++ b/test/wsmf/selectors/test_baselines.py
@@ -0,0 +1,106 @@
+from unittest.mock import Mock, patch
+
+import numpy as np
+from torch import Tensor, rand
+
+from wsmf.selectors import (
+    LandmarkerHpSelector,
+    RandomHpSelector,
+    RankBasedHpSelector,
+)
+
+
+@patch("numpy.random.choice")
+def test_random_hp_selector(choice_mock: Mock) -> None:
+    # Given
+    choice_mock.return_value = np.array([0, 2])
+    selector = RandomHpSelector(
+        {
+            "dataset1": (rand((5, 3)), rand((5, 1))),
+            "dataset2": (rand((10, 2)), rand((10, 1))),
+        },
+        {
+            "dataset1": Tensor([1, 2, 3]),
+            "dataset2": Tensor([2, 3, 4]),
+        },
+        [
+            {"hparam1": 1},
+            {"hparam2": 2},
+            {"hparam3": 3},
+        ],
+    )
+
+    # When
+    proposed_configurations = selector.propose_configurations(
+        (rand((4, 3)), rand((4, 1))), 2
+    )
+
+    # Then
+    assert proposed_configurations == [
+        {"hparam1": 1},
+        {"hparam3": 3},
+    ]
+
+
+def test_rank_based_hp_selector() -> None:
+    # Given
+    selector = RankBasedHpSelector(
+        {
+            "dataset1": (rand((5, 3)), rand((5, 1))),
+            "dataset2": (rand((10, 2)), rand((10, 1))),
+            "dataset3": (rand((10, 2)), rand((10, 1))),
+        },
+        {
+            "dataset1": Tensor([1, 2, 3]),
+            "dataset2": Tensor([2, 3, 2]),
+            "dataset3": Tensor([3, 4, 1]),
+        },
+        [
+            {"hparam1": 1},
+            {"hparam2": 2},
+            {"hparam3": 3},
+        ],
+    )
+
+    # When
+    proposed_configurations = selector.propose_configurations(
+        (rand((4, 3)), rand((4, 1))), 2
+    )
+
+    # Then
+    assert proposed_configurations == [
+        {"hparam2": 2},
+        {"hparam3": 3},
+    ]
+
+
+def test_landmarker_hp_selector() -> None:
+    # Given
+    selector = LandmarkerHpSelector(
+        {
+            "dataset1": (rand((5, 3)), rand((5, 1))),
+            "dataset2": (rand((10, 2)), rand((10, 1))),
+            "dataset3": (rand((10, 2)), rand((10, 1))),
+        },
+        {
+            "dataset1": Tensor([0.03, 0.01, 0.01]),
+            "dataset2": Tensor([0.2, 0.3, 0.2]),
+            "dataset3": Tensor([0.4, 0.5, 0.6]),
+        },
+        [
+            {"hparam1": 1},
+            {"hparam2": 2},
+            {"hparam3": 3},
+        ],
+    )
+
+    # When
+    proposed_configurations = selector.propose_configurations(
+        Tensor([0.35, 0.35, 0.35]), 2  # type: ignore
+    )
+
+    # Then
+    assert proposed_configurations == [
+        {"hparam2": 2},
+        {"hparam3": 3},
+    ]
diff --git a/test/wsmf/selectors/test_reconstruction_based.py b/test/wsmf/selectors/test_reconstruction_based.py
new file mode 100644
index 0000000..ae3af0b
--- /dev/null
+++ b/test/wsmf/selectors/test_reconstruction_based.py
@@ -0,0 +1,38 @@
+from unittest.mock import Mock
+
+from torch import Tensor, rand
+
+from wsmf.selectors import ReconstructionBasedHpSelector
+
+
+def test_reconstruction_based_hp_selector() -> None:
+    # Given
+    selector = ReconstructionBasedHpSelector(
+        Mock(return_value=Tensor([0.35, 0.35, 0.35])),
+        {
+            "dataset1": (rand((5, 3)), rand((5, 1))),
+            "dataset2": (rand((10, 2)), rand((10, 1))),
+            "dataset3": (rand((10, 2)), rand((10, 1))),
+        },
+        {
+            "dataset1": Tensor([0.03, 0.01, 0.01]),
+            "dataset2": Tensor([0.2, 0.3, 0.2]),
+            "dataset3": Tensor([0.4, 0.5, 0.6]),
+        },
+        [
+            {"hparam1": 1},
+            {"hparam2": 2},
+            {"hparam3": 3},
+        ],
+    )
+
+    # When
+    proposed_configurations = selector.propose_configurations(
+        (rand((5, 3)), rand((5, 1))), 2
+    )
+
+    # Then
+    assert proposed_configurations == [
+        {"hparam2": 2},
+        {"hparam3": 3},
+    ]
diff --git a/test/wsmf/selectors/test_representation_based.py b/test/wsmf/selectors/test_representation_based.py
new file mode 100644
index 0000000..afb5e7a
--- /dev/null
+++ b/test/wsmf/selectors/test_representation_based.py
@@ -0,0 +1,46 @@
+from unittest.mock import Mock
+
+from torch import Tensor, rand
+
+from wsmf.selectors import RepresentationBasedHpSelector
+
+
+def test_representation_based_hp_selector() -> None:
+    # Given
+    mock_representations_iterator = iter(
+        [
+            Tensor([0.03, 0.01, 0.01]),
+            Tensor([0.2, 0.3, 0.2]),
+            Tensor([0.2, 0.3, 0.2]),
+            Tensor([0.35, 0.35, 0.35]),
+        ]
+    )
+    selector = RepresentationBasedHpSelector(
+        Mock(side_effect=lambda x, y: next(mock_representations_iterator)),
+        {
+            "dataset1": (rand((5, 3)), rand((5, 1))),
+            "dataset2": (rand((10, 2)), rand((10, 1))),
+            "dataset3": (rand((10, 2)), rand((10, 1))),
+        },
+        {
+            "dataset1": Tensor([11, 2, 3]),
+            "dataset2": Tensor([3, 10, 12]),
+            "dataset3": Tensor([6, 9, 8]),
+        },
+        [
+            {"hparam1": 1},
+            {"hparam2": 2},
+            {"hparam3": 3},
+        ],
+    )
+
+    # When
+    proposed_configurations = selector.propose_configurations(
+        (rand((5, 3)), rand((5, 1))), 2
+    )
+
+    # Then
+    assert proposed_configurations == [
+        {"hparam3": 3},
+        {"hparam2": 2},
+    ]
diff --git a/test/wsmf/selectors/test_selector.py b/test/wsmf/selectors/test_selector.py
new file mode 100644
index 0000000..9a2b5b5
--- /dev/null
+++ b/test/wsmf/selectors/test_selector.py
@@ -0,0 +1,46 @@
+from typing import Tuple
+
+from torch import Tensor, rand
+
+from wsmf.selectors.selector import WarmstartHpSelector
+
+
+class MockSelector(WarmstartHpSelector):
+
+    def propose_configurations_idx(
+        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
+    ) -> list[int]:
+        return [2, 0, 1]
+
+
+def test_propose_configurations() -> None:
+    # Given
+    selector = MockSelector(
+        {
+            "dataset1": (rand((5, 3)), rand((5, 1))),
+            "dataset2": (rand((10, 2)), rand((10, 1))),
+            "dataset3": (rand((10, 2)), rand((10, 1))),
+        },
+        {
+            "dataset1": Tensor([11, 2, 3]),
+            "dataset2": Tensor([3, 10, 12]),
+            "dataset3": Tensor([6, 9, 8]),
+        },
+        [
+            {"hparam1": 1},
+            {"hparam2": 2},
+            {"hparam3": 3},
+        ],
+    )
+
+    # When
+    proposed_configurations = selector.propose_configurations(
+        (rand((5, 3)), rand((5, 1))), 3
+    )
+
+    # Then
+    assert proposed_configurations == [
+        {"hparam3": 3},
+        {"hparam1": 1},
+        {"hparam2": 2},
+    ]
diff --git a/test/wsmf/selectors/test_utils.py b/test/wsmf/selectors/test_utils.py
new file mode 100644
index 0000000..15071d7
--- /dev/null
+++ b/test/wsmf/selectors/test_utils.py
@@ -0,0 +1,21 @@
+import numpy as np
+
+from wsmf.selectors.utils import get_ranks_of_hp_configurations
+
+
+def test_get_ranks_of_hp_configurations() -> None:
+    # Given
+    performance_matrix = np.array(
+        [
+            [0.5, 0.6, 0.5, 0.7],
+            [0.5, 0.6, 0.7, 0.6],
+            [0.5, 0.7, 0.5, 0.7],
+            [0.4, 0.6, 0.5, 0.4],
+        ]
+    )
+
+    # When
+    actual_idx = get_ranks_of_hp_configurations(performance_matrix)
+
+    # Then
+    assert actual_idx == [1, 3, 2, 0]
diff --git a/wsmf/__init__.py b/wsmf/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/wsmf/metamodels/data/__init__.py b/wsmf/metamodels/data/__init__.py
index 855bc23..e3c85e5 100644
--- a/wsmf/metamodels/data/__init__.py
+++ b/wsmf/metamodels/data/__init__.py
@@ -1,7 +1,9 @@
-from .dataset import EncoderHpoDataset
-from .landmarker_reconstruction import LandmarkerReconstructionLoader
-from .metric_loader import EncoderMetricLearningLoader
-from .repeatable import GenericRepeatableD2vLoader
+from wsmf.metamodels.data.dataset import EncoderHpoDataset
+from wsmf.metamodels.data.landmarker_reconstruction import (
+    LandmarkerReconstructionLoader,
+)
+from wsmf.metamodels.data.metric_loader import EncoderMetricLearningLoader
+from wsmf.metamodels.data.repeatable import GenericRepeatableD2vLoader
 
 __all__ = [
     "EncoderHpoDataset",
diff --git a/wsmf/metamodels/data/dataset.py b/wsmf/metamodels/data/dataset.py
index d35b386..3907134 100644
--- a/wsmf/metamodels/data/dataset.py
+++ b/wsmf/metamodels/data/dataset.py
@@ -4,7 +4,7 @@
 from torch.utils.data import Dataset
 
 
-class EncoderHpoDataset(Dataset):
+class EncoderHpoDataset(Dataset):  # type: ignore
     """
     Dataset class for encoding HPO data.
 
@@ -50,7 +50,7 @@ def __init__(
     def __len__(self) -> int:
         return len(self.datasets)
 
-    def __getitem__(self, dataset_name) -> Tuple[Tensor, Tensor, Tensor]:
+    def __getitem__(self, dataset_name: str) -> Tuple[Tensor, Tensor, Tensor]:
         return (
             *self.datasets[dataset_name],
             self.hp_landmarkers[dataset_name],
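The `__getitem__` change above makes the keying explicit: items are looked up by dataset name, not by integer index. A small usage sketch mirroring `test_dataset.py`; the positional argument order (`datasets`, then `landmarkers`) is inferred from the class attributes, not stated in the diff:

```python
from torch import Tensor

from wsmf.metamodels.data import EncoderHpoDataset

datasets = {
    "dataset1": (Tensor([[1, 2, 3], [4, 5, 6]]), Tensor([[0], [1]])),
    "dataset2": (Tensor([[7, 8], [9, 10]]), Tensor([[1], [0]])),
}
landmarkers = {
    "dataset1": Tensor([-1, -2, -3]),
    "dataset2": Tensor([0.5, 0.5, 0.5]),
}

d2v_hpo_dataset = EncoderHpoDataset(datasets, landmarkers)
assert len(d2v_hpo_dataset) == 2
assert d2v_hpo_dataset.dataset_names == ["dataset1", "dataset2"]
X, y, lm = d2v_hpo_dataset["dataset1"]  # keyed by name
```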
diff --git a/wsmf/metamodels/data/landmarker_reconstruction.py b/wsmf/metamodels/data/landmarker_reconstruction.py
index 340c0c7..79b6cc9 100644
--- a/wsmf/metamodels/data/landmarker_reconstruction.py
+++ b/wsmf/metamodels/data/landmarker_reconstruction.py
@@ -1,12 +1,12 @@
 from __future__ import annotations
 
 from copy import deepcopy
-from typing import Tuple
+from typing import Any, Tuple
 
 import numpy as np
 from torch import Tensor
 
-from .dataset import EncoderHpoDataset
+from wsmf.metamodels.data.dataset import EncoderHpoDataset
 
 
 class LandmarkerReconstructionLoader:
@@ -72,7 +72,9 @@ def __init__(
 
     def __next__(
         self,
-    ) -> list[Tuple[Tensor, Tensor, Tensor] | Tuple[Tensor, Tensor, dict]]:
+    ) -> list[
+        Tuple[Tensor, Tensor, Tensor] | Tuple[Tensor, Tensor, dict[str, Any]]
+    ]:
         start_index = self.batch_counter * self.batch_size
         end_index = (
             start_index + self.batch_size
@@ -92,10 +94,12 @@
     def __iter__(self) -> LandmarkerReconstructionLoader:
         self.sample_indices = np.random.permutation(self.n_datasets)
         return deepcopy(self)
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.n_datasets // self.batch_size + 1
 
-    def __generate_sample(self, dataset_idx) -> Tuple[Tensor, Tensor, Tensor]:
+    def __generate_sample(
+        self, dataset_idx: int
+    ) -> Tuple[Tensor, Tensor, Tensor]:
         dataset_name = self.dataset_names[dataset_idx]
         return self.dataset[dataset_name]
diff --git a/wsmf/metamodels/data/metric_loader.py b/wsmf/metamodels/data/metric_loader.py
index d8e78d9..7f7c5b9 100644
--- a/wsmf/metamodels/data/metric_loader.py
+++ b/wsmf/metamodels/data/metric_loader.py
@@ -6,7 +6,7 @@
 import numpy as np
 from torch import Tensor
 
-from .dataset import EncoderHpoDataset
+from wsmf.metamodels.data.dataset import EncoderHpoDataset
 
 
 class EncoderMetricLearningLoader:
@@ -68,7 +68,7 @@ def __next__(self) -> list[Tuple[Tensor, Tensor, Tensor, Tensor, float]]:
     def __iter__(self) -> EncoderMetricLearningLoader:
         return deepcopy(self)
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.n_batches
 
     def __generate_sample(
@@ -96,4 +96,4 @@ def __generate_sample(
     def __calculcate_landmarkers_similarity(
         self, landmarkers1: Tensor, landmarkers2: Tensor
     ) -> float:
-        return ((landmarkers1 - landmarkers2) ** 2).mean().item()
+        return ((landmarkers1 - landmarkers2) ** 2).mean().item()  # type: ignore # noqa E501
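The landmarker similarity targeted above is a plain mean-squared distance. The `56 / 3` asserted in `test_metric_loader.py` falls out of exactly this arithmetic; the concrete tensors below are an assumption chosen to reproduce that value, since the test fixture is not visible in this diff:

```python
from torch import Tensor

l1, l2 = Tensor([1.0, 2.0, 3.0]), Tensor([-1.0, -2.0, -3.0])
# (2**2 + 4**2 + 6**2) / 3 == 56 / 3 ~= 18.67
print(((l1 - l2) ** 2).mean().item())
```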
diff --git a/wsmf/metamodels/data/repeatable.py b/wsmf/metamodels/data/repeatable.py
index fd22027..0bc8af3 100644
--- a/wsmf/metamodels/data/repeatable.py
+++ b/wsmf/metamodels/data/repeatable.py
@@ -1,23 +1,24 @@
+from __future__ import annotations
+
 from copy import deepcopy
-from typing import Iterable
+from typing import Any, Iterable
 
 
 class GenericRepeatableD2vLoader:
 
-    def __init__(self, loader: Iterable):
+    def __init__(self, loader: Iterable[Any]):
         self.batches = list(loader)
         self.released_batches_count = 0
 
-    def __next__(self):
+    def __next__(self) -> Any:
        if self.released_batches_count == len(self.batches):
             raise StopIteration()
         batch = self.batches[self.released_batches_count]
         self.released_batches_count += 1
         return batch
 
-    def __iter__(self):
+    def __iter__(self) -> GenericRepeatableD2vLoader:
         return deepcopy(self)
 
-    def __len__(self):
-
+    def __len__(self) -> int:
         return len(self.batches)
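`GenericRepeatableD2vLoader` materializes whatever the wrapped loader yields and replays it verbatim; because `__iter__` hands out a deepcopy, every pass sees the same batches (useful for a stable validation stream). A quick sketch:

```python
from wsmf.metamodels.data.repeatable import GenericRepeatableD2vLoader

loader = GenericRepeatableD2vLoader(iter([["batch0"], ["batch1"]]))
print(list(loader))  # [['batch0'], ['batch1']]
print(list(loader))  # same again - iterating does not exhaust the loader
```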
""" labels = true_landmarkers.to(predicted_landmarkers.device) - return ((predicted_landmarkers - labels) ** 2).mean(dim=1).mean() + return ((predicted_landmarkers - labels) ** 2).mean(dim=1).mean() # type: ignore # noqa: E501 diff --git a/wsmf/metamodels/train/__init__.py b/wsmf/metamodels/train/__init__.py index dfa6db6..3c0a4dc 100644 --- a/wsmf/metamodels/train/__init__.py +++ b/wsmf/metamodels/train/__init__.py @@ -1,5 +1,7 @@ -from .metric import MetricLearningTrainingInterface -from .reconstruction import LandmarkerReconstructionTrainingInterface +from wsmf.metamodels.train.metric import MetricLearningTrainingInterface +from wsmf.metamodels.train.reconstruction import ( + LandmarkerReconstructionTrainingInterface, +) __all__ = [ "MetricLearningTrainingInterface", diff --git a/wsmf/metamodels/train/interface.py b/wsmf/metamodels/train/interface.py index 46a9a1b..36c90e7 100644 --- a/wsmf/metamodels/train/interface.py +++ b/wsmf/metamodels/train/interface.py @@ -5,6 +5,7 @@ import torch from dataset2vec.config import OptimizerConfig from torch import Tensor, stack +from torch.optim.optimizer import Optimizer class TrainingInterface(pl.LightningModule, ABC): @@ -20,7 +21,7 @@ def __init__( self.save_hyperparameters() @abstractmethod - def forward(self, *args, **kwargs) -> Tensor: + def forward(self, *args, **kwargs) -> Tensor: # type: ignore pass @abstractmethod @@ -30,9 +31,9 @@ def calculate_datasets_similarity( pass @abstractmethod - def configure_optimizers( + def configure_optimizers( # type: ignore self, - ) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]: + ) -> tuple[list[Optimizer], list[dict[str, Any]]]: pass # Training phase diff --git a/wsmf/metamodels/train/metric.py b/wsmf/metamodels/train/metric.py index 78e7a07..f8bae5b 100644 --- a/wsmf/metamodels/train/metric.py +++ b/wsmf/metamodels/train/metric.py @@ -13,4 +13,4 @@ def calculate_datasets_similarity( ) -> Tensor: emb1 = self(X1, y1) emb2 = self(X2, y2) - return ((emb1 - emb2) ** 2).mean() + return ((emb1 - emb2) ** 2).mean() # type: ignore diff --git a/wsmf/selectors/__init__.py b/wsmf/selectors/__init__.py new file mode 100644 index 0000000..750185e --- /dev/null +++ b/wsmf/selectors/__init__.py @@ -0,0 +1,15 @@ +from .baselines import ( + LandmarkerHpSelector, + RandomHpSelector, + RankBasedHpSelector, +) +from .reconstruction_based import ReconstructionBasedHpSelector +from .representation_based import RepresentationBasedHpSelector + +__all__ = [ + "LandmarkerHpSelector", + "RandomHpSelector", + "RankBasedHpSelector", + "ReconstructionBasedHpSelector", + "RepresentationBasedHpSelector", +] diff --git a/wsmf/selectors/baselines.py b/wsmf/selectors/baselines.py index e5c975a..d811107 100644 --- a/wsmf/selectors/baselines.py +++ b/wsmf/selectors/baselines.py @@ -1,12 +1,11 @@ -from typing import Literal, Tuple +from typing import Any, Tuple import numpy as np import torch from torch import Tensor -from engine.portfolio_selection import get_ranks_of_hp_configurations - from .selector import WarmstartHpSelector +from .utils import get_ranks_of_hp_configurations class RandomHpSelector(WarmstartHpSelector): @@ -14,14 +13,11 @@ class RandomHpSelector(WarmstartHpSelector): def propose_configurations_idx( self, dataset: Tuple[Tensor, Tensor], n_configurations: int ) -> list[int]: - return np.random.choice( - len(self.configurations), size=n_configurations, replace=False - ).tolist() - - def propose_configuration_idx_asmfo( - self, dataset: Tuple[Tensor, Tensor], n_configurations: int - ) -> list[int]: - return 
diff --git a/wsmf/metamodels/train/__init__.py b/wsmf/metamodels/train/__init__.py
index dfa6db6..3c0a4dc 100644
--- a/wsmf/metamodels/train/__init__.py
+++ b/wsmf/metamodels/train/__init__.py
@@ -1,5 +1,7 @@
-from .metric import MetricLearningTrainingInterface
-from .reconstruction import LandmarkerReconstructionTrainingInterface
+from wsmf.metamodels.train.metric import MetricLearningTrainingInterface
+from wsmf.metamodels.train.reconstruction import (
+    LandmarkerReconstructionTrainingInterface,
+)
 
 __all__ = [
     "MetricLearningTrainingInterface",
diff --git a/wsmf/metamodels/train/interface.py b/wsmf/metamodels/train/interface.py
index 46a9a1b..36c90e7 100644
--- a/wsmf/metamodels/train/interface.py
+++ b/wsmf/metamodels/train/interface.py
@@ -5,6 +5,7 @@
 import torch
 from dataset2vec.config import OptimizerConfig
 from torch import Tensor, stack
+from torch.optim.optimizer import Optimizer
 
 
 class TrainingInterface(pl.LightningModule, ABC):
@@ -20,7 +21,7 @@ def __init__(
         self.save_hyperparameters()
 
     @abstractmethod
-    def forward(self, *args, **kwargs) -> Tensor:
+    def forward(self, *args, **kwargs) -> Tensor:  # type: ignore
         pass
 
     @abstractmethod
@@ -30,9 +31,9 @@ def calculate_datasets_similarity(
         pass
 
     @abstractmethod
-    def configure_optimizers(
+    def configure_optimizers(  # type: ignore
         self,
-    ) -> tuple[list[torch.optim.Optimizer], list[dict[str, Any]]]:
+    ) -> tuple[list[Optimizer], list[dict[str, Any]]]:
         pass
 
     # Training phase
diff --git a/wsmf/metamodels/train/metric.py b/wsmf/metamodels/train/metric.py
index 78e7a07..f8bae5b 100644
--- a/wsmf/metamodels/train/metric.py
+++ b/wsmf/metamodels/train/metric.py
@@ -13,4 +13,4 @@ def calculate_datasets_similarity(
     ) -> Tensor:
         emb1 = self(X1, y1)
         emb2 = self(X2, y2)
-        return ((emb1 - emb2) ** 2).mean()
+        return ((emb1 - emb2) ** 2).mean()  # type: ignore
diff --git a/wsmf/selectors/__init__.py b/wsmf/selectors/__init__.py
new file mode 100644
index 0000000..750185e
--- /dev/null
+++ b/wsmf/selectors/__init__.py
@@ -0,0 +1,15 @@
+from .baselines import (
+    LandmarkerHpSelector,
+    RandomHpSelector,
+    RankBasedHpSelector,
+)
+from .reconstruction_based import ReconstructionBasedHpSelector
+from .representation_based import RepresentationBasedHpSelector
+
+__all__ = [
+    "LandmarkerHpSelector",
+    "RandomHpSelector",
+    "RankBasedHpSelector",
+    "ReconstructionBasedHpSelector",
+    "RepresentationBasedHpSelector",
+]
diff --git a/wsmf/selectors/baselines.py b/wsmf/selectors/baselines.py
index e5c975a..d811107 100644
--- a/wsmf/selectors/baselines.py
+++ b/wsmf/selectors/baselines.py
@@ -1,12 +1,11 @@
-from typing import Literal, Tuple
+from typing import Any, Tuple
 
 import numpy as np
 import torch
 from torch import Tensor
 
-from engine.portfolio_selection import get_ranks_of_hp_configurations
-
 from .selector import WarmstartHpSelector
+from .utils import get_ranks_of_hp_configurations
 
 
 class RandomHpSelector(WarmstartHpSelector):
@@ -14,14 +13,11 @@ class RandomHpSelector(WarmstartHpSelector):
     def propose_configurations_idx(
         self, dataset: Tuple[Tensor, Tensor], n_configurations: int
     ) -> list[int]:
-        return np.random.choice(
-            len(self.configurations), size=n_configurations, replace=False
-        ).tolist()
-
-    def propose_configuration_idx_asmfo(
-        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
-    ) -> list[int]:
-        return self.propose_configurations_idx(dataset, n_configurations)
+        return list(
+            np.random.choice(
+                len(self.configurations), size=n_configurations, replace=False
+            )
+        )
 
 
 class RankBasedHpSelector(WarmstartHpSelector):
@@ -30,10 +26,9 @@ def __init__(
         self,
         metadataset: dict[str, Tuple[Tensor, Tensor]],
         landmarkers: dict[str, Tensor],
-        configurations: list[dict],
-        algorithm: Literal["greedy", "asmfo"],
+        configurations: list[dict[str, Any]],
     ):
-        super().__init__(metadataset, landmarkers, configurations, algorithm)
+        super().__init__(metadataset, landmarkers, configurations)
         self.ranks = get_ranks_of_hp_configurations(
             np.stack(
                 [landmarker.cpu().numpy() for landmarker in self.landmarkers]
@@ -45,11 +40,6 @@ def propose_configurations_idx(
     ) -> list[int]:
         return self.ranks[:n_configurations]
 
-    def propose_configuration_idx_asmfo(
-        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
-    ) -> list[int]:
-        return self.propose_configurations_idx(dataset, n_configurations)
-
 
 class LandmarkerHpSelector(WarmstartHpSelector):
 
@@ -57,13 +47,12 @@ def __init__(
         self,
         metadataset: dict[str, Tuple[Tensor, Tensor]],
         landmarkers: dict[str, Tensor],
-        configurations: list[dict],
-        algorithm: Literal["greedy", "asmfo"] = "greedy",
+        configurations: list[dict[str, Any]],
     ):
-        super().__init__(metadataset, landmarkers, configurations, algorithm)
+        super().__init__(metadataset, landmarkers, configurations)
 
     def propose_configurations_idx(
-        self, landmarkers: Tensor, n_configurations: int
+        self, landmarkers: Tensor, n_configurations: int  # type: ignore
     ) -> list[int]:
         distances = np.array(
             [
@@ -74,29 +63,8 @@ def propose_configurations_idx(
         closest_landmarkers_idx = np.argpartition(distances, n_configurations)[
             :n_configurations
         ].tolist()
+
         return [
             self.best_configurations_idx[idx]
             for idx in closest_landmarkers_idx
         ]
-
-    def propose_configuration_idx_asmfo(
-        self, landmarkers: Tensor, n_configurations: int
-    ) -> list[int]:
-        distances = np.array(
-            [
-                torch.norm(landmarkers - landmarkers_from_db).cpu().numpy()
-                for landmarkers_from_db in self.landmarkers
-            ]
-        )
-        closest_landmarkers_idx = np.argpartition(distances, n_configurations)[
-            :n_configurations
-        ].tolist()
-        closest_landmarkers = torch.stack(
-            [self.landmarkers[idx] for idx in closest_landmarkers_idx]
-        )
-        ranks_of_configurations = get_ranks_of_hp_configurations(
-            closest_landmarkers.cpu().numpy()
-        )
-        return np.argpartition(ranks_of_configurations, n_configurations)[
-            :n_configurations
-        ].tolist()
diff --git a/wsmf/selectors/reconstruction.py b/wsmf/selectors/reconstruction.py
deleted file mode 100644
index 510f915..0000000
--- a/wsmf/selectors/reconstruction.py
+++ /dev/null
@@ -1,128 +0,0 @@
-from pathlib import Path
-from typing import Literal, Tuple
-
-import numpy as np
-import torch
-from torch import Tensor
-
-from engine.metamodels.networks.d2v_reconstruction import (
-    Dataset2VecForLandmarkerReconstruction,
-)
-from engine.portfolio_selection import get_ranks_of_hp_configurations
-
-from .selector import WarmstartHpSelector
-
-
-class Dataset2VecReconstructionHpSelector(WarmstartHpSelector):
-
-    def __init__(
-        self,
-        encoder_path: Path,
-        metadataset: dict[str, Tuple[Tensor, Tensor]],
-        landmarkers: dict[str, Tensor],
-        configurations: list[dict],
-        algorithm: Literal["greedy", "asmfo"] = "greedy",
-    ):
-        super().__init__(metadataset, landmarkers, configurations, algorithm)
-        self.encoder = (
-            Dataset2VecForLandmarkerReconstruction.load_from_checkpoint(
-                encoder_path
-            )
-        )
-
-    @torch.no_grad()
-    def propose_configurations_idx(
-        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
-    ) -> list[int]:
-        predicted_landmarkers = self.encoder(*dataset)
-        distances = np.array(
-            [
-                torch.norm(predicted_landmarkers - landmarker_from_db)
-                .cpu()
-                .numpy()
-                for landmarker_from_db in self.landmarkers
-            ]
-        )
-        closest_datasets_idx = np.argpartition(distances, n_configurations)[
-            :n_configurations
-        ].tolist()
-        return [
-            self.best_configurations_idx[idx] for idx in closest_datasets_idx
-        ]
-
-    @torch.no_grad()
-    def propose_configuration_idx_asmfo(
-        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
-    ) -> list[int]:
-        predicted_landmarkers = self.encoder(*dataset)
-        distances = np.array(
-            [
-                torch.norm(predicted_landmarkers - landmarker_from_db)
-                .cpu()
-                .numpy()
-                for landmarker_from_db in self.landmarkers
-            ]
-        )
-        closest_datasets_idx = np.argpartition(distances, n_configurations)[
-            :n_configurations
-        ].tolist()
-        closest_datasets = torch.stack(
-            [self.landmarkers[idx] for idx in closest_datasets_idx]
-        )
-        ranks_of_configurations = get_ranks_of_hp_configurations(
-            closest_datasets.cpu().numpy()
-        )
-        return np.argpartition(ranks_of_configurations, n_configurations)[
-            :n_configurations
-        ].tolist()
-
-
-class Dataset2VecReconstructionHpSelectorMixedDistances(WarmstartHpSelector):
-
-    def __init__(
-        self,
-        encoder_path: Path,
-        metadataset: dict[str, Tuple[Tensor, Tensor]],
-        landmarkers: dict[str, Tensor],
-        configurations: list[dict],
-        algorithm: Literal["greedy", "asmfo"] = "greedy",
-        n_closest: int = 3,
-        n_furthest: int = 2,
-    ):
-        super().__init__(metadataset, landmarkers, configurations, algorithm)
-        self.encoder = (
-            Dataset2VecForLandmarkerReconstruction.load_from_checkpoint(
-                encoder_path
-            )
-        )
-        self.n_closest = n_closest
-        self.n_furthest = n_furthest
-
-    @torch.no_grad()
-    def propose_configurations_idx(
-        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
-    ) -> list[int]:
-        predicted_landmarkers = self.encoder(*dataset)
-        distances = np.array(
-            [
-                torch.norm(predicted_landmarkers - encoding_from_db)
-                .cpu()
-                .numpy()
-                for encoding_from_db in self.landmarkers
-            ]
-        )
-        closest_datasets_idx = np.argpartition(distances, n_configurations)[
-            : self.n_closest
-        ].tolist()
-        furthest_datasets_idx = np.argpartition(-distances, n_configurations)[
-            : self.n_furthest
-        ].tolist()
-        return [
-            self.best_configurations_idx[idx]
-            for idx in closest_datasets_idx + furthest_datasets_idx
-        ]
-
-    def propose_configuration_idx_asmfo(
-        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
-    ) -> list[int]:
-        raise NotImplementedError()
diff --git a/wsmf/selectors/reconstruction_based.py b/wsmf/selectors/reconstruction_based.py
new file mode 100644
index 0000000..bd54bfd
--- /dev/null
+++ b/wsmf/selectors/reconstruction_based.py
@@ -0,0 +1,42 @@
+from typing import Any, Tuple
+
+import numpy as np
+import torch
+from torch import Tensor
+
+from wsmf.metamodels.train import LandmarkerReconstructionTrainingInterface
+
+from .selector import WarmstartHpSelector
+
+
+class ReconstructionBasedHpSelector(WarmstartHpSelector):
+
+    def __init__(
+        self,
+        encoder: LandmarkerReconstructionTrainingInterface,
+        metadataset: dict[str, Tuple[Tensor, Tensor]],
+        landmarkers: dict[str, Tensor],
+        configurations: list[dict[str, Any]],
+    ):
+        super().__init__(metadataset, landmarkers, configurations)
+        self.encoder = encoder
+
+    @torch.no_grad()
+    def propose_configurations_idx(
+        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
+    ) -> list[int]:
+        predicted_landmarkers = self.encoder(*dataset)
+        distances = np.array(
+            [
+                torch.norm(predicted_landmarkers - landmarker_from_db)
+                .cpu()
+                .numpy()
+                for landmarker_from_db in self.landmarkers
+            ]
+        )
+        closest_datasets_idx = np.argpartition(distances, n_configurations)[
+            :n_configurations
+        ].tolist()
+        return [
+            self.best_configurations_idx[idx] for idx in closest_datasets_idx
+        ]
diff --git a/wsmf/selectors/representation_based.py b/wsmf/selectors/representation_based.py
index b424fd3..3b0a8ca 100644
--- a/wsmf/selectors/representation_based.py
+++ b/wsmf/selectors/representation_based.py
@@ -1,37 +1,34 @@
-from pathlib import Path
-from typing import Literal, Tuple
+from typing import Any, Tuple
 
 import numpy as np
 import torch
 from dataset2vec.model import Dataset2Vec
 from torch import Tensor
 
-from experiments_engine.metamodels.networks.d2v_new_loss import (
-    Dataset2VecForHpo,
-)
-from experiments_engine.portfolio_selection import (
-    get_ranks_of_hp_configurations,
+from wsmf.metamodels.train import (
+    LandmarkerReconstructionTrainingInterface,
+    MetricLearningTrainingInterface,
 )
 
 from .selector import WarmstartHpSelector
 
 
-class Dataset2VecHpSelector(WarmstartHpSelector):
+class RepresentationBasedHpSelector(WarmstartHpSelector):
 
     @torch.no_grad()
     def __init__(
         self,
-        encoder_path: Path,
+        encoder: (
+            Dataset2Vec
+            | MetricLearningTrainingInterface
+            | LandmarkerReconstructionTrainingInterface
+        ),
         metadataset: dict[str, Tuple[Tensor, Tensor]],
         landmarkers: dict[str, Tensor],
-        configurations: list[dict],
-        algorithm: Literal["greedy", "asmfo"] = "greedy",
+        configurations: list[dict[str, Any]],
     ):
-        super().__init__(metadataset, landmarkers, configurations, algorithm)
-        try:
-            self.encoder = Dataset2Vec.load_from_checkpoint(encoder_path)
-        except:
-            self.encoder = Dataset2VecForHpo.load_from_checkpoint(encoder_path)
+        super().__init__(metadataset, landmarkers, configurations)
+        self.encoder = encoder
         self.encodings = [
             self.encoder(*dataset_from_db) for dataset_from_db in self.datasets
         ]
@@ -53,71 +50,3 @@ def propose_configurations_idx(
         return [
             self.best_configurations_idx[idx] for idx in closest_datasets_idx
         ]
-
-    @torch.no_grad()
-    def propose_configuration_idx_asmfo(
-        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
-    ) -> list[int]:
-        dataset_encoding = self.encoder(*dataset)
-        distances = np.array(
-            [
-                torch.norm(dataset_encoding - encoding_from_db).cpu().numpy()
-                for encoding_from_db in self.encodings
-            ]
-        )
-        closest_datasets_idx = np.argpartition(distances, n_configurations)[
-            :n_configurations
-        ].tolist()
-        closest_datasets = torch.stack(
-            [self.landmarkers[idx] for idx in closest_datasets_idx]
-        )
-        ranks_of_configurations = get_ranks_of_hp_configurations(
-            closest_datasets.cpu().numpy()
-        )
-        return np.argpartition(ranks_of_configurations, n_configurations)[
-            :n_configurations
-        ].tolist()
-
-
-class Dataset2VecHpSelectorMixedDistances(WarmstartHpSelector):
-
-    @torch.no_grad()
-    def __init__(
-        self,
-        encoder_path: Path,
-        metadataset: dict[str, Tuple[Tensor, Tensor]],
-        landmarkers: dict[str, Tensor],
-        configurations: list[dict],
-        algorithm: Literal["greedy", "asmfo"] = "greedy",
-        n_closest: int = 3,
-        n_furthest: int = 2,
-    ):
-        super().__init__(metadataset, landmarkers, configurations, algorithm)
-        self.encoder = Dataset2Vec.load_from_checkpoint(encoder_path)
-        self.encodings = [
-            self.encoder(*dataset_from_db) for dataset_from_db in self.datasets
-        ]
-        self.n_closest = n_closest
-        self.n_furthest = n_furthest
-
-    @torch.no_grad()
-    def propose_configurations_idx(
-        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
-    ) -> list[int]:
-        dataset_encoding = self.encoder(*dataset)
-        distances = np.array(
-            [
-                torch.norm(dataset_encoding - encoding_from_db).cpu().numpy()
-                for encoding_from_db in self.encodings
-            ]
-        )
-        closest_datasets_idx = np.argpartition(distances, n_configurations)[
-            : self.n_closest
-        ].tolist()
-        furthest_datasets_idx = np.argpartition(-distances, n_configurations)[
-            : self.n_furthest
-        ].tolist()
-        return [
-            self.best_configurations_idx[idx]
-            for idx in closest_datasets_idx + furthest_datasets_idx
-        ]
diff --git a/wsmf/selectors/selector.py b/wsmf/selectors/selector.py
index e744a04..c7f805c 100644
--- a/wsmf/selectors/selector.py
+++ b/wsmf/selectors/selector.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Literal, Tuple
+from typing import Any, Tuple
 
 import numpy as np
 from torch import Tensor
@@ -11,13 +11,11 @@ def __init__(
         self,
         metadataset: dict[str, Tuple[Tensor, Tensor]],
         landmarkers: dict[str, Tensor],
-        configurations: list[dict],
-        algorithm: Literal["greedy", "asmfo"] = "greedy",
+        configurations: list[dict[str, Any]],
     ):
         self.landmarkers_orig = landmarkers
         self.configurations = configurations
         self.metadataset = metadataset
-        self.algorithm = algorithm
 
         self.datasets_names = list(sorted(metadataset.keys()))
         self.best_configurations_idx = [
@@ -33,13 +31,8 @@ def __init__(
 
     def propose_configurations(
         self, dataset: Tuple[Tensor, Tensor], n_configurations: int
-    ) -> list[dict]:
-        if self.algorithm == "greedy":
-            idx = self.propose_configurations_idx(dataset, n_configurations)
-        elif self.algorithm == "asmfo":
-            idx = self.propose_configuration_idx_asmfo(
-                dataset, n_configurations
-            )
+    ) -> list[dict[str, Any]]:
+        idx = self.propose_configurations_idx(dataset, n_configurations)
         return [self.configurations[i] for i in idx]
 
     @abstractmethod
@@ -47,9 +40,3 @@ def propose_configurations_idx(
         self, dataset: Tuple[Tensor, Tensor], n_configurations: int
     ) -> list[int]:
         pass
-
-    @abstractmethod
-    def propose_configuration_idx_asmfo(
-        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
-    ) -> list[int]:
-        pass
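With the `asmfo` branch gone, `WarmstartHpSelector` is a single-method contract: a subclass implements `propose_configurations_idx`, and the base `propose_configurations` maps the returned indices onto the configuration list. A toy subclass in the spirit of the `MockSelector` test above (the class name and selection strategy here are illustrative only):

```python
from typing import Tuple

from torch import Tensor, rand

from wsmf.selectors.selector import WarmstartHpSelector


class EveryOtherSelector(WarmstartHpSelector):
    def propose_configurations_idx(
        self, dataset: Tuple[Tensor, Tensor], n_configurations: int
    ) -> list[int]:
        # Illustrative strategy: pick every second configuration.
        return list(range(0, len(self.configurations), 2))[:n_configurations]


selector = EveryOtherSelector(
    {"d1": (rand((5, 3)), rand((5, 1))), "d2": (rand((8, 2)), rand((8, 1)))},
    {"d1": Tensor([1.0, 2.0, 3.0]), "d2": Tensor([3.0, 2.0, 1.0])},
    [{"lr": 0.1}, {"lr": 0.01}, {"lr": 0.001}],
)
print(selector.propose_configurations((rand((4, 3)), rand((4, 1))), 2))
# -> [{'lr': 0.1}, {'lr': 0.001}]
```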
diff --git a/wsmf/selectors/utils.py b/wsmf/selectors/utils.py
new file mode 100644
index 0000000..cacc775
--- /dev/null
+++ b/wsmf/selectors/utils.py
@@ -0,0 +1,14 @@
+from typing import Any
+
+import numpy as np
+from numpy.typing import NDArray
+from scipy.stats import rankdata
+
+
+def get_ranks_of_hp_configurations(hp_performances: NDArray[Any]) -> list[int]:
+    ranks_per_dataset = np.array(
+        [rankdata(-row, method="dense") for row in hp_performances]
+    )
+    average_ranks_per_configuration = ranks_per_dataset.mean(axis=0)
+    final_ranks = np.argsort(average_ranks_per_configuration)
+    return list(final_ranks)