Commit 8a61358 (0 parents): 47 changed files with 3,449 additions and 0 deletions.
@@ -0,0 +1,8 @@
**__pycache__**
*.db
errors
.vscode/
results
d2v*
!d2v*.py
.venv/
@@ -0,0 +1,10 @@
## Running code
### Setup
```
pip install -r requirements.txt
export PYTHONPATH=`pwd`
```
### Loading data
```
python bin/load_data.py
```
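
The `PYTHONPATH` export makes the repository root importable, which the scripts under `bin/` rely on when they `import experiments_engine`. A quick sanity check after setup might look like this (a small sketch, not part of the repository):

```python
# Verify that the project package resolves after `export PYTHONPATH=`pwd``
# (run from the repository root).
import importlib.util

assert importlib.util.find_spec("experiments_engine") is not None, (
    "experiments_engine is not importable; re-run the export from the repo root"
)
```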
@@ -0,0 +1,115 @@
import argparse
import json
import warnings

import pandas as pd
import pytorch_lightning as pl
import yaml
from loguru import logger
from optuna import samplers
from torch import Tensor
from tqdm import tqdm

import experiments_engine.hpo as hpo_cls_pkg
from experiments_engine.hp_selectors.baselines import LandmarkerHpSelector
from experiments_engine.hp_selectors.factory import (
    SelectorsFactory,
    get_hp_selector_from_path,
)
from experiments_engine.paths import paths_provider
from experiments_engine.utils import extract_dataset_name_from_path
from experiments_engine.warmstart_utils import (
    get_hpo_task_from_path,
    perform_ground_truth_warm_start_experiment,
    perform_warm_start_experiment,
)

warnings.simplefilter("ignore")


def main():
    pl.seed_everything(123)
    logger.info("Parsing shell args")
    parser = argparse.ArgumentParser()
    parser.add_argument("--objective", type=str)
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--sampler-name", type=str)
    args = parser.parse_args()
    with open(paths_provider.hp_selectors_path, "r") as f:
        config = yaml.load(f, yaml.CLoader)
    objective_cls = getattr(hpo_cls_pkg, args.objective)
    sampler_cls = getattr(samplers, args.sampler_name)

    logger.info("Initializing configurations selectors")
    selectors = [
        (
            config_entry.pop("name"),
            SelectorsFactory.get_selector_from_config(
                config_entry, args.model_name
            ),
        )
        for config_entry in config
    ]
    landmarker_based_selector = get_hp_selector_from_path(
        LandmarkerHpSelector,
        paths_provider.train_meta_dataset_path,
        paths_provider.hp_portfolio_configuratioons_path
        / f"{args.model_name}_half_random.json",
        paths_provider.landmarkers_path / f"{args.model_name}.json",
    )

    with open(
        paths_provider.landmarkers_path / f"{args.model_name}.json"
    ) as f:
        landmarkers_all = json.load(f)

    logger.info("Starting computation")
    experiment_results = []
    for dataset_path in (
        pbar := tqdm(
            list(sorted(paths_provider.val_meta_dataset_path.iterdir()))
        )
    ):
        objective = objective_cls(*get_hpo_task_from_path(dataset_path))
        pbar.set_postfix_str(extract_dataset_name_from_path(dataset_path))
        hp_result = perform_ground_truth_warm_start_experiment(
            objective,
            Tensor(
                landmarkers_all[extract_dataset_name_from_path(dataset_path)]
            ).cuda(),
            landmarker_based_selector,  # type: ignore
            seed=1,
            n_trials=20,
            n_initial_trials=5,
            sampler_cls=sampler_cls,
        )
        hp_result["dataset"] = extract_dataset_name_from_path(dataset_path)
        hp_result["warmstart"] = "Landmarkers"
        experiment_results.append(hp_result)

        for selector_name, selector in selectors:
            hp_result = perform_warm_start_experiment(
                objective,
                selector,
                seed=1,
                n_trials=20,
                n_initial_trials=5,
                sampler_cls=sampler_cls,
            )
            hp_result["dataset"] = extract_dataset_name_from_path(dataset_path)
            hp_result["warmstart"] = selector_name
            experiment_results.append(hp_result)

    logger.info("Postprocessing results")
    experiment_results = pd.concat(experiment_results, axis=0).reset_index(
        drop=True
    )
    logger.info("Saving results")
    experiment_results.to_csv(
        paths_provider.warmstart_results_path / f"{args.model_name}.csv",
        index=False,
    )


if __name__ == "__main__":
    main()
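
This script resolves `--objective` against `experiments_engine.hpo` and `--sampler-name` against `optuna.samplers`, then runs each warm-start strategy on every validation meta-dataset. The implementations of `perform_warm_start_experiment` and the selectors live elsewhere in `experiments_engine` and are not shown in this commit; as a rough orientation only, warm starting an Optuna study typically means enqueuing selector-proposed configurations before the sampler takes over. A minimal sketch under that assumption (names like `run_warm_started_study` and the toy objective are illustrative, not the repository's API):

```python
# Illustrative warm-started HPO loop with Optuna; the repository's
# perform_warm_start_experiment may work differently.
import optuna


def objective(trial: optuna.Trial) -> float:
    # Toy objective used only for this sketch.
    x = trial.suggest_float("x", -10, 10)
    return (x - 2) ** 2


def run_warm_started_study(
    initial_params: list[dict], n_trials: int = 20
) -> optuna.Study:
    study = optuna.create_study(
        direction="minimize", sampler=optuna.samplers.TPESampler(seed=1)
    )
    # Evaluate the suggested configurations first ...
    for params in initial_params:
        study.enqueue_trial(params)
    # ... then let the sampler continue from those observations.
    study.optimize(objective, n_trials=n_trials)
    return study


study = run_warm_started_study([{"x": 1.5}, {"x": 2.5}], n_trials=20)
```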
@@ -0,0 +1,70 @@
import json
from operator import itemgetter

import pytorch_lightning as pl
from loguru import logger
from openml import datasets, tasks
from tqdm import tqdm

from experiments_engine.data import (
    is_eligible_task,
    move_target_to_last_column,
)
from experiments_engine.paths import paths_provider


def main():
    pl.seed_everything(123)
    logger.info("Loading task ids")
    with open(paths_provider.tasks_ids_path, "r") as f:
        tasks_ids = json.load(f)

    logger.info("Loading prohibited dataset names")
    with open(paths_provider.prohibited_datasets_path, "r") as f:
        prohibited_datasets = json.load(f)

    logger.info("Loading tasks")
    classification_tasks = tasks.list_tasks(
        task_type=tasks.TaskType.SUPERVISED_CLASSIFICATION
    )
    classification_tasks = list(
        map(
            itemgetter(1),
            filter(
                lambda item: item[0] in tasks_ids,
                classification_tasks.items(),
            ),
        )
    )
    classification_tasks = list(filter(is_eligible_task, classification_tasks))

    logger.info("Loading raw datasets")
    error_count = 0
    for task in (pbar := tqdm(classification_tasks)):
        pbar.set_postfix(
            {
                "features": task.get("NumberOfFeatures"),
                "instances": task.get("NumberOfInstances"),
                "classes": task.get("NumberOfClasses"),
                "errors": error_count,
            }
        )
        try:
            dataset = datasets.get_dataset(task["did"])
            if dataset.name in prohibited_datasets:
                continue
            dataset_df = dataset.get_data()[0]
            dataset_df = move_target_to_last_column(
                dataset_df, task["target_feature"]  # type: ignore
            )
            filename = task["name"]
            dataset_df.to_parquet(
                paths_provider.raw_datasets_path / f"{filename}.parquet",
                index=False,
            )
        except Exception:  # count tasks that fail to download or convert
            error_count += 1


if __name__ == "__main__":
    main()
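
`is_eligible_task` and `move_target_to_last_column` are imported from `experiments_engine.data`, whose definitions are not part of this commit view. A minimal sketch of what such helpers could look like (the eligibility thresholds here are invented for illustration and are not the repository's actual criteria):

```python
# Hypothetical helpers; the real experiments_engine.data module may differ.
import pandas as pd


def move_target_to_last_column(df: pd.DataFrame, target: str) -> pd.DataFrame:
    # Reorder columns so the target variable ends up as the last column.
    columns = [c for c in df.columns if c != target] + [target]
    return df[columns]


def is_eligible_task(task: dict) -> bool:
    # Illustrative size filter on the OpenML task metadata fields
    # used in the progress bar above; thresholds are assumptions.
    n_instances = task.get("NumberOfInstances") or 0
    n_features = task.get("NumberOfFeatures") or 0
    n_classes = task.get("NumberOfClasses") or 0
    return 100 <= n_instances <= 100_000 and n_features <= 500 and n_classes >= 2
```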
@@ -0,0 +1,69 @@
from typing import Tuple

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from loguru import logger
from numpy import random
from sklearn.datasets import make_classification
from tqdm import tqdm

from experiments_engine.paths import paths_provider


def generate_random_dataset() -> Tuple[np.ndarray, np.ndarray]:
    n_samples = random.randint(1_000, 20_000)
    n_features = random.randint(2, 50)
    n_informative = random.randint(2, n_features + 1)
    one_class_weight = random.uniform()
    weights = [1 - one_class_weight, one_class_weight]
    flip_y = random.uniform(0, 0.5)
    return make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_features - n_informative,
        weights=weights,
        flip_y=flip_y,
    )


def generate_random_number_of_datasets() -> list[pd.DataFrame]:
    X, y = generate_random_dataset()
    y = y.reshape(-1, 1)
    df = pd.DataFrame(data=np.concatenate([X, y], axis=1))
    n_datasets = np.random.randint(1, 6)
    if n_datasets == 1:
        return [df]
    else:
        dataset_size = df.shape[0] // n_datasets
        dfs = [
            df.iloc[
                i
                * dataset_size : np.min(  # noqa: E203
                    [(i + 1) * dataset_size, df.shape[0]]
                )
            ]
            for i in range(n_datasets)
        ]
        return dfs


def main() -> None:
    pl.seed_everything(123)

    logger.info("Generating synthetic datasets")
    dataset_counter = 0
    for _ in tqdm(range(200)):
        dfs = generate_random_number_of_datasets()
        for df in dfs:
            df.to_parquet(
                paths_provider.raw_datasets_path
                / f"{dataset_counter:04d}.parquet",
                index=False,
            )
            dataset_counter += 1


if __name__ == "__main__":
    main()
@@ -0,0 +1,42 @@
import shutil

import numpy as np
import pytorch_lightning as pl
from loguru import logger
from tqdm import tqdm

from experiments_engine.paths import paths_provider
from experiments_engine.utils import extract_dataset_name_from_path


def main():
    pl.seed_everything(123)
    logger.info("Splitting to meta-train and meta-val")
    for dataset_path in tqdm(paths_provider.datasets_splitted_path.iterdir()):
        dataset_path_name = extract_dataset_name_from_path(dataset_path)
        if np.random.uniform() <= 0.3:
            shutil.copytree(
                paths_provider.datasets_splitted_path / dataset_path_name,
                paths_provider.val_meta_dataset_path / dataset_path_name,
            )
            shutil.copy(
                paths_provider.datasets_binarized_path
                / f"{dataset_path_name}.parquet",
                paths_provider.val_meta_dataset_path_for_plain_d2v
                / f"{dataset_path_name}.parquet",
            )
        else:
            shutil.copytree(
                paths_provider.datasets_splitted_path / dataset_path_name,
                paths_provider.train_meta_dataset_path / dataset_path_name,
            )
            shutil.copy(
                paths_provider.datasets_binarized_path
                / f"{dataset_path_name}.parquet",
                paths_provider.train_meta_dataset_path_for_plain_d2v
                / f"{dataset_path_name}.parquet",
            )


if __name__ == "__main__":
    main()
@@ -0,0 +1,66 @@
import warnings

import pandas as pd
import pytorch_lightning as pl
from loguru import logger
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from experiments_engine.data import (
    clean_and_binarize_classification,
    remove_unwanted_columns,
)
from experiments_engine.paths import paths_provider
from experiments_engine.utils import extract_dataset_name_from_path

warnings.simplefilter(action="ignore")


def main() -> None:
    pl.seed_everything(123)
    logger.info("Preprocessing & Saving tasks")
    for dataset_path in (
        pbar := tqdm(sorted(paths_provider.raw_datasets_path.iterdir()))
    ):
        df = pd.read_parquet(dataset_path)
        df = clean_and_binarize_classification(df)
        if (df.iloc[:, -1] == 1).sum() < 2 or (df.iloc[:, -1] == 0).sum() < 2:
            logger.warning(
                f"Skipping {dataset_path} due to extreme class imbalance"
            )
            continue
        df = remove_unwanted_columns(df)
        df.to_parquet(
            paths_provider.datasets_binarized_path / dataset_path.name,
            index=False,
        )
        pbar.set_postfix(
            {
                "shape": df.shape,
                "p": df.iloc[:, -1].mean(),
                "task_name": extract_dataset_name_from_path(dataset_path),
            }
        )

    logger.info("Splitting datasets")
    for dataset_path in (
        pbar := tqdm(sorted(paths_provider.datasets_binarized_path.iterdir()))
    ):
        dataset_name = extract_dataset_name_from_path(dataset_path)
        pbar.set_postfix({"dataset": dataset_name})
        df = pd.read_parquet(dataset_path)
        df_train, df_test = train_test_split(
            df, stratify=df.iloc[:, -1], random_state=123
        )
        output_dataset_path = (
            paths_provider.datasets_splitted_path / dataset_name
        )
        output_dataset_path.mkdir(exist_ok=True, parents=True)
        df_train.to_parquet(output_dataset_path / "train.parquet", index=False)
        df_test.to_parquet(output_dataset_path / "test.parquet", index=False)

    logger.info("Finished")


if __name__ == "__main__":
    main()
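
As with the loading script, `clean_and_binarize_classification` and `remove_unwanted_columns` come from `experiments_engine.data` and are not included in this commit view. A rough sketch of plausible implementations, assuming the target sits in the last column as the script above does (illustrative only, not the repository's code):

```python
# Hypothetical preprocessing helpers; the real implementations may differ.
import pandas as pd


def clean_and_binarize_classification(df: pd.DataFrame) -> pd.DataFrame:
    # Drop rows with a missing target, then map the majority class to 0
    # and everything else to 1 so downstream code sees a binary target.
    df = df.dropna(subset=[df.columns[-1]])
    target = df.iloc[:, -1]
    majority_class = target.value_counts().idxmax()
    df.iloc[:, -1] = (target != majority_class).astype(int)
    return df


def remove_unwanted_columns(df: pd.DataFrame) -> pd.DataFrame:
    # Drop constant feature columns, one example of columns with no signal.
    feature_cols = df.columns[:-1]
    constant = [c for c in feature_cols if df[c].nunique(dropna=False) <= 1]
    return df.drop(columns=constant)
```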