Initial commit
azoz01 committed Aug 2, 2024
0 parents commit 8a61358
Showing 47 changed files with 3,449 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
**__pycache__**
*.db
errors
.vscode/
results
d2v*
!d2v*.py
.venv/
10 changes: 10 additions & 0 deletions README.md
@@ -0,0 +1,10 @@
## Running code
### Setup
```
pip install -r requirements.txt
export PYTHONPATH=`pwd`
```
### Loading data
```
python bin/load_data.py
```
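### Running the warmstart experiment
`bin/evaluate/perform_warmstart_experiment.py` (added in this commit) expects the name of an objective class from `experiments_engine.hpo`, a model name, and an Optuna sampler class name. A hedged example invocation; `XGBoostObjective` and `xgboost` are hypothetical placeholders, while `TPESampler` is a real class in `optuna.samplers`:
```
python bin/evaluate/perform_warmstart_experiment.py \
    --objective XGBoostObjective \
    --model-name xgboost \
    --sampler-name TPESampler
```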
115 changes: 115 additions & 0 deletions bin/evaluate/perform_warmstart_experiment.py
@@ -0,0 +1,115 @@
import argparse
import json
import warnings

import pandas as pd
import pytorch_lightning as pl
import yaml
from loguru import logger
from optuna import samplers
from torch import Tensor
from tqdm import tqdm

import experiments_engine.hpo as hpo_cls_pkg
from experiments_engine.hp_selectors.baselines import LandmarkerHpSelector
from experiments_engine.hp_selectors.factory import (
    SelectorsFactory,
    get_hp_selector_from_path,
)
from experiments_engine.paths import paths_provider
from experiments_engine.utils import extract_dataset_name_from_path
from experiments_engine.warmstart_utils import (
    get_hpo_task_from_path,
    perform_ground_truth_warm_start_experiment,
    perform_warm_start_experiment,
)

warnings.simplefilter("ignore")


def main():
    pl.seed_everything(123)
    logger.info("Parsing shell args")
    parser = argparse.ArgumentParser()
    parser.add_argument("--objective", type=str)
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--sampler-name", type=str)
    args = parser.parse_args()
    with open(paths_provider.hp_selectors_path, "r") as f:
        config = yaml.load(f, yaml.CLoader)
    objective_cls = getattr(hpo_cls_pkg, args.objective)
    sampler_cls = getattr(samplers, args.sampler_name)

    logger.info("Initializing configurations selectors")
    selectors = [
        (
            config_entry.pop("name"),
            SelectorsFactory.get_selector_from_config(
                config_entry, args.model_name
            ),
        )
        for config_entry in config
    ]
    landmarker_based_selector = get_hp_selector_from_path(
        LandmarkerHpSelector,
        paths_provider.train_meta_dataset_path,
        paths_provider.hp_portfolio_configuratioons_path
        / f"{args.model_name}_half_random.json",
        paths_provider.landmarkers_path / f"{args.model_name}.json",
    )

    with open(
        paths_provider.landmarkers_path / f"{args.model_name}.json"
    ) as f:
        landmarkers_all = json.load(f)

    logger.info("Starting computation")
    experiment_results = []
    for dataset_path in (
        pbar := tqdm(
            list(sorted(paths_provider.val_meta_dataset_path.iterdir()))
        )
    ):
        objective = objective_cls(*get_hpo_task_from_path(dataset_path))
        pbar.set_postfix_str(extract_dataset_name_from_path(dataset_path))
        hp_result = perform_ground_truth_warm_start_experiment(
            objective,
            Tensor(
                landmarkers_all[extract_dataset_name_from_path(dataset_path)]
            ).cuda(),
            landmarker_based_selector,  # type: ignore
            seed=1,
            n_trials=20,
            n_initial_trials=5,
            sampler_cls=sampler_cls,
        )
        hp_result["dataset"] = extract_dataset_name_from_path(dataset_path)
        hp_result["warmstart"] = "Landmarkers"
        experiment_results.append(hp_result)

        for selector_name, selector in selectors:
            hp_result = perform_warm_start_experiment(
                objective,
                selector,
                seed=1,
                n_trials=20,
                n_initial_trials=5,
                sampler_cls=sampler_cls,
            )
            hp_result["dataset"] = extract_dataset_name_from_path(dataset_path)
            hp_result["warmstart"] = selector_name
            experiment_results.append(hp_result)

    logger.info("Postprocessing results")
    experiment_results = pd.concat(experiment_results, axis=0).reset_index(
        drop=True
    )
    logger.info("Saving results")
    experiment_results.to_csv(
        paths_provider.warmstart_results_path / f"{args.model_name}.csv",
        index=False,
    )


if __name__ == "__main__":
    main()
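The warm-start helpers used above (`perform_warm_start_experiment`, `perform_ground_truth_warm_start_experiment`) come from `experiments_engine.warmstart_utils`, which is not part of this excerpt. For orientation, here is a minimal standalone sketch of the underlying technique, warm-starting an Optuna study by enqueuing suggested configurations before regular optimization; the toy objective and the enqueued configurations are hypothetical stand-ins, and the repository's helpers may differ in detail:
```
import optuna


def objective(trial: optuna.Trial) -> float:
    # Hypothetical toy objective; the repository's objectives wrap real models.
    x = trial.suggest_float("x", -10, 10)
    y = trial.suggest_float("y", -10, 10)
    return (x - 2) ** 2 + (y + 1) ** 2


# Configurations a selector might propose (hard-coded stand-ins here).
warm_start_configs = [{"x": 1.9, "y": -0.8}, {"x": 0.0, "y": 0.0}]

study = optuna.create_study(
    direction="minimize", sampler=optuna.samplers.TPESampler(seed=1)
)
# enqueue_trial pre-loads parameter sets; Optuna evaluates them first,
# so the sampler starts from informed points instead of purely random ones.
for config in warm_start_configs:
    study.enqueue_trial(config)
study.optimize(objective, n_trials=20)
print(study.best_params)
```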
70 changes: 70 additions & 0 deletions bin/load_data/download_data_from_openml.py
@@ -0,0 +1,70 @@
import json
from operator import itemgetter

import pytorch_lightning as pl
from loguru import logger
from openml import datasets, tasks
from tqdm import tqdm

from experiments_engine.data import (
    is_eligible_task,
    move_target_to_last_column,
)
from experiments_engine.paths import paths_provider


def main():
    pl.seed_everything(123)
    logger.info("Loading tasks ids")
    with open(paths_provider.tasks_ids_path, "r") as f:
        tasks_ids = json.load(f)

    logger.info("Loading prohibited datasets names")
    with open(paths_provider.prohibited_datasets_path, "r") as f:
        prohibited_datasets = json.load(f)

    logger.info("Loading tasks")
    classification_tasks = tasks.list_tasks(
        task_type=tasks.TaskType.SUPERVISED_CLASSIFICATION
    )
    classification_tasks = list(
        map(
            itemgetter(1),
            filter(
                lambda item: item[0] in tasks_ids,
                classification_tasks.items(),
            ),
        )
    )
    classification_tasks = list(filter(is_eligible_task, classification_tasks))

    logger.info("Loading raw datasets")
    error_count = 0
    for task in (pbar := tqdm(classification_tasks)):
        pbar.set_postfix(
            {
                "features": task.get("NumberOfFeatures"),
                "instances": task.get("NumberOfInstances"),
                "classes": task.get("NumberOfClasses"),
                "errors": error_count,
            }
        )
        try:
            dataset = datasets.get_dataset(task["did"])
            if dataset.name in prohibited_datasets:
                continue
            dataset_df = dataset.get_data()[0]
            dataset_df = move_target_to_last_column(
                dataset_df, task["target_feature"]  # type: ignore
            )
            filename = task["name"]
            dataset_df.to_parquet(
                paths_provider.raw_datasets_path / f"{filename}.parquet",
                index=False,
            )
        except Exception:  # skip datasets that fail to download or convert
            error_count += 1


if __name__ == "__main__":
    main()
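For orientation, a minimal sketch of the OpenML calls this script depends on, using dataset id 61 (the classic iris dataset) as an arbitrary example. In recent `openml-python` versions, `get_data()` returns a tuple whose first element is a dataframe containing every column, target included, which is why the script takes `dataset.get_data()[0]` and then moves the target to the last column:
```
from openml import datasets

# Fetch a single dataset by its id; 61 is the classic iris dataset.
dataset = datasets.get_dataset(61)

# get_data() returns (data, target, categorical_mask, column_names);
# without a target argument, `data` holds all columns, target included.
df, target, categorical_mask, column_names = dataset.get_data()
print(dataset.name, df.shape, dataset.default_target_attribute)
```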
69 changes: 69 additions & 0 deletions bin/load_data/generate_synthetic_data.py
@@ -0,0 +1,69 @@
from typing import Tuple

import numpy as np
import pandas as pd
import pytorch_lightning as pl
from loguru import logger
from numpy import random
from sklearn.datasets import make_classification
from tqdm import tqdm

from experiments_engine.paths import paths_provider


def generate_random_dataset() -> Tuple[np.ndarray, np.ndarray]:
    n_samples = random.randint(1_000, 20_000)
    n_features = random.randint(2, 50)
    n_informative = random.randint(2, n_features + 1)
    one_class_weight = random.uniform()
    weights = [1 - one_class_weight, one_class_weight]
    flip_y = random.uniform(0, 0.5)
    return make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_features - n_informative,
        weights=weights,
        flip_y=flip_y,
    )


def generate_random_number_of_datasets() -> list[pd.DataFrame]:
    X, y = generate_random_dataset()
    y = y.reshape(-1, 1)
    df = pd.DataFrame(data=np.concatenate([X, y], axis=1))
    n_datasets = np.random.randint(1, 6)
    if n_datasets == 1:
        return [df]
    else:
        dataset_size = df.shape[0] // n_datasets
        dfs = [
            df.iloc[
                i
                * dataset_size : np.min(  # noqa: E203
                    [(i + 1) * dataset_size, df.shape[0]]
                )
            ]
            for i in range(n_datasets)
        ]
        return dfs


def main() -> None:
    pl.seed_everything(123)

    logger.info("Generating synthetic datasets")
    dataset_counter = 0
    for _ in tqdm(range(200)):
        dfs = generate_random_number_of_datasets()
        for df in dfs:
            df.to_parquet(
                paths_provider.raw_datasets_path
                / f"{dataset_counter:04d}.parquet",
                index=False,
            )
            dataset_counter += 1


if __name__ == "__main__":
    main()
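Note that `generate_random_dataset` always sets `n_redundant = n_features - n_informative`, so scikit-learn's requirement that informative plus redundant (plus repeated) features not exceed `n_features` holds by construction. A standalone illustration with fixed stand-in values:
```
from sklearn.datasets import make_classification

# 10 features, 4 informative, hence 6 redundant: together they cover
# n_features exactly, satisfying sklearn's constraint.
X, y = make_classification(
    n_samples=1_000,
    n_features=10,
    n_informative=4,
    n_redundant=6,
    weights=[0.7, 0.3],
    flip_y=0.1,
)
print(X.shape, y.mean())  # (1000, 10); mean close to the second class weight
```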
42 changes: 42 additions & 0 deletions bin/load_data/meta_split.py
@@ -0,0 +1,42 @@
import shutil

import numpy as np
import pytorch_lightning as pl
from loguru import logger
from tqdm import tqdm

from experiments_engine.paths import paths_provider
from experiments_engine.utils import extract_dataset_name_from_path


def main():
    pl.seed_everything(123)
    logger.info("Splitting to meta-train and meta-val")
    for dataset_path in tqdm(paths_provider.datasets_splitted_path.iterdir()):
        dataset_path_name = extract_dataset_name_from_path(dataset_path)
        # Per-dataset Bernoulli split: ~30% of datasets go to meta-validation.
        if np.random.uniform() <= 0.3:
            shutil.copytree(
                paths_provider.datasets_splitted_path / dataset_path_name,
                paths_provider.val_meta_dataset_path / dataset_path_name,
            )
            shutil.copy(
                paths_provider.datasets_binarized_path
                / f"{dataset_path_name}.parquet",
                paths_provider.val_meta_dataset_path_for_plain_d2v
                / f"{dataset_path_name}.parquet",
            )
        else:
            shutil.copytree(
                paths_provider.datasets_splitted_path / dataset_path_name,
                paths_provider.train_meta_dataset_path / dataset_path_name,
            )
            shutil.copy(
                paths_provider.datasets_binarized_path
                / f"{dataset_path_name}.parquet",
                paths_provider.train_meta_dataset_path_for_plain_d2v
                / f"{dataset_path_name}.parquet",
            )


if __name__ == "__main__":
    main()
66 changes: 66 additions & 0 deletions bin/load_data/preprocess_data.py
@@ -0,0 +1,66 @@
import warnings

import pandas as pd
import pytorch_lightning as pl
from loguru import logger
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from experiments_engine.data import (
    clean_and_binarize_classification,
    remove_unwanted_columns,
)
from experiments_engine.paths import paths_provider
from experiments_engine.utils import extract_dataset_name_from_path

warnings.simplefilter(action="ignore")


def main() -> None:
    pl.seed_everything(123)
    logger.info("Preprocessing & Saving tasks")
    for dataset_path in (
        pbar := tqdm(sorted(paths_provider.raw_datasets_path.iterdir()))
    ):
        df = pd.read_parquet(dataset_path)
        df = clean_and_binarize_classification(df)
        if (df.iloc[:, -1] == 1).sum() < 2 or (df.iloc[:, -1] == 0).sum() < 2:
            logger.warning(
                f"Skipping {dataset_path} due to extreme class imbalance"
            )
            continue
        df = remove_unwanted_columns(df)
        df.to_parquet(
            paths_provider.datasets_binarized_path / dataset_path.name,
            index=False,
        )
        pbar.set_postfix(
            {
                "shape": df.shape,
                "p": df.iloc[:, -1].mean(),
                "task_name": extract_dataset_name_from_path(dataset_path),
            }
        )

    logger.info("Splitting datasets")
    for dataset_path in (
        pbar := tqdm(sorted(paths_provider.datasets_binarized_path.iterdir()))
    ):
        dataset_name = extract_dataset_name_from_path(dataset_path)
        pbar.set_postfix({"dataset": dataset_name})
        df = pd.read_parquet(dataset_path)
        df_train, df_test = train_test_split(
            df, stratify=df.iloc[:, -1], random_state=123
        )
        output_dataset_path = (
            paths_provider.datasets_splitted_path / dataset_name
        )
        output_dataset_path.mkdir(exist_ok=True, parents=True)
        df_train.to_parquet(output_dataset_path / "train.parquet", index=False)
        df_test.to_parquet(output_dataset_path / "test.parquet", index=False)

    logger.info("Finished")


if __name__ == "__main__":
    main()
