diff --git a/CHANGELOG.md b/CHANGELOG.md
index 933559f76d4..69848882145 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@ All notable changes to this project will be documented in this file.
 
 ### Enhancements
 
 - Upgrade NNCF to 2.7 and OpenVINO to 2023.2 ()
+- Automate performance benchmark ()
 
 ## \[v1.5.0\]
diff --git a/tests/perf/__init__.py b/tests/perf/__init__.py
new file mode 100644
index 00000000000..9984d0cb25b
--- /dev/null
+++ b/tests/perf/__init__.py
@@ -0,0 +1,4 @@
+"""OTX Performance tests."""
+
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
diff --git a/tests/perf/benchmark.py b/tests/perf/benchmark.py
new file mode 100644
index 00000000000..f39ed806731
--- /dev/null
+++ b/tests/perf/benchmark.py
@@ -0,0 +1,202 @@
+"""OTX Benchmark based on tools/experiment.py."""
+
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+import os
+import glob
+import pandas as pd
+import yaml
+from pathlib import Path
+from typing import List, Optional
+
+from tests.test_suite.run_test_command import check_run
+
+
+class OTXBenchmark:
+    """Benchmark runner based on tools/experiment.py in OTX1.x.
+
+    Example:
+        >>> bm = OTXBenchmark(['random_sample1', 'random_sample2'], data_root='./data/coco')
+        >>> atss_result = bm.run('MobileNetV2-ATSS')
+        >>> yolox_result = bm.run('YOLOX-TINY')
+
+    Args:
+        datasets (List[str]): Paths to datasets relative to the data_root.
+            Intended for, but not restricted to, different samplings of the same dataset.
+        data_root (str): Path to the root of dataset directories. Defaults to './data'.
+        num_epoch (int): Overrides the per-model default number of epochs.
+            Defaults to 0, which means no override.
+        num_repeat (int): Number of trials with different random seeds, which are set
+            to range(0, num_repeat). Defaults to 1.
+        train_params (dict, optional): Additional training parameters.
+            e.g. {'learning_parameters.num_iters': 2}. Defaults to {}.
+        track_resources (bool): Whether to track CPU & GPU usage metrics. Defaults to False.
+        eval_upto (str): The last serial operation to evaluate. Choose one of ('train', 'export', 'optimize').
+            Operations include the preceding ones.
+            e.g. eval up to 'optimize': train -> eval -> export -> eval -> optimize -> eval
+            Defaults to 'train'.
+        output_root (str): Output root directory for logs and results. Defaults to './otx-benchmark'.
+        dry_run (bool): Whether to just print the OTX command without execution. Defaults to False.
+        tags (dict, optional): Key-value pair metadata for the experiment.
+        subset_dir_names (dict, optional): Specify dataset subset directory names, if any.
+            e.g. {"train": "train_10percent", "val": "val_all", "test": "test"}
+    """
+
+    def __init__(
+        self,
+        datasets: List[str],
+        data_root: str = "data",
+        num_epoch: int = 0,
+        num_repeat: int = 1,
+        train_params: Optional[dict] = None,
+        track_resources: bool = False,
+        eval_upto: str = "train",
+        output_root: str = "otx-benchmark",
+        dry_run: bool = False,
+        tags: Optional[dict] = None,
+        subset_dir_names: Optional[dict] = None,
+    ):
+        self.datasets = datasets
+        self.data_root = data_root
+        self.num_epoch = num_epoch
+        self.num_repeat = num_repeat
+        self.train_params = train_params or {}
+        self.track_resources = track_resources
+        self.eval_upto = eval_upto
+        self.output_root = output_root
+        self.dry_run = dry_run
+        self.tags = tags or {}
+        self.subset_dir_names = subset_dir_names or {"train": "", "val": "", "test": ""}
+
+    def run(
+        self,
+        model_id: str,
+        train_params: dict = {},
+        tags: dict = {},
+    ) -> Optional[pd.DataFrame]:
+        """Run configured benchmark with given model and return the result.
+
+        Args:
+            model_id (str): Target model identifier
+            train_params (dict): Overrides global benchmark train params
+            tags (dict): Overrides global benchmark tags
+
+        Returns:
+            Optional[pd.DataFrame]: Table with benchmark metrics
+        """
+
+        # Build config file
+        cfg = self._build_config(model_id, train_params, tags)
+        cfg_dir = Path(cfg["output_path"])
+        cfg_dir.mkdir(parents=True, exist_ok=True)
+        cfg_path = cfg_dir / "cfg.yaml"
+        with open(cfg_path, "w") as cfg_file:
+            yaml.dump(cfg, cfg_file, indent=2)
+        cmd = [
+            "python",
+            "tools/experiment.py",
+            "-f",
+            cfg_path,
+        ]
+        if self.dry_run:
+            cmd.append("-d")
+        # Run benchmark
+        check_run(cmd)
+        # Load result
+        result = self.load_result(cfg_dir)
+        return result
+
+    @staticmethod
+    def load_result(result_path: str) -> Optional[pd.DataFrame]:
+        """Load benchmark results recursively and merge as pd.DataFrame.
+
+        Args:
+            result_path (str): Result directory or specific file.
+
+        Returns:
+            Optional[pd.DataFrame]: Table with benchmark metrics & options
+        """
+        # Search csv files
+        if os.path.isdir(result_path):
+            csv_file_paths = glob.glob(f"{result_path}/**/exp_summary.csv", recursive=True)
+        else:
+            csv_file_paths = [result_path]
+        results = []
+        # Load csv data
+        for csv_file_path in csv_file_paths:
+            result = pd.read_csv(csv_file_path)
+            # Append metadata if any
+            cfg_file_path = Path(csv_file_path).parent / "cfg.yaml"
+            if cfg_file_path.exists():
+                with cfg_file_path.open("r") as cfg_file:
+                    tags = yaml.safe_load(cfg_file).get("tags", {})
+                    for k, v in tags.items():
+                        result[k] = v
+            results.append(result)
+        if len(results) > 0:
+            return pd.concat(results, ignore_index=True)
+        else:
+            return None
+
+    def _build_config(
+        self,
+        model_id: str,
+        train_params: dict = {},
+        tags: dict = {},
+    ) -> dict:
+        """Build config for tools/experiment.py."""
+        all_train_params = self.train_params.copy()
+        all_train_params.update(train_params)
+        all_tags = self.tags.copy()
+        all_tags.update(tags)
+
+        cfg = {}
+        cfg["tags"] = all_tags  # metadata
+        cfg["output_path"] = os.path.abspath(Path(self.output_root) / "-".join(list(all_tags.values()) + [model_id]))
+        cfg["constants"] = {
+            "dataroot": os.path.abspath(self.data_root),
+        }
+        cfg["variables"] = {
+            "model": [model_id],
+            "data": self.datasets,
+        }
+        cfg["repeat"] = self.num_repeat
+        cfg["command"] = []
+        resource_param = ""
+        if self.track_resources:
+            resource_param = "--track-resource-usage all"
+        if self.num_epoch > 0:
+            self._set_num_epoch(model_id, all_train_params, self.num_epoch)
+        params_str = " ".join([f"--{k} {v}" for k, v in all_train_params.items()])
+        cfg["command"].append(
+            "otx train ${model}"
+            " --train-data-roots ${dataroot}/${data}" + f"/{self.subset_dir_names['train']}"
+            " --val-data-roots ${dataroot}/${data}" + f"/{self.subset_dir_names['val']}"
+            " --deterministic"
+            f" {resource_param}"
+            f" params {params_str}"
+        )
+        cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}" + f"/{self.subset_dir_names['test']}")
+        if self.eval_upto == "train":
+            return cfg
+
+        cfg["command"].append("otx export")
+        cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}" + f"/{self.subset_dir_names['test']}")
+        if self.eval_upto == "export":
+            return cfg
+
+        cfg["command"].append("otx optimize")
+        cfg["command"].append("otx eval --test-data-roots ${dataroot}/${data}" + f"/{self.subset_dir_names['test']}")
+        return cfg
+
+    @staticmethod
+    def _set_num_epoch(model_id: str, train_params: dict, num_epoch: int):
+        """Set model specific num_epoch parameter."""
+        if "padim" in model_id:
+            return  # No configurable parameter for num_epoch
+        elif "stfpm" in model_id:
+            train_params["learning_parameters.max_epochs"] = num_epoch
+        else:
+            train_params["learning_parameters.num_iters"] = num_epoch
diff --git a/tests/perf/conftest.py b/tests/perf/conftest.py
new file mode 100644
index 00000000000..0d831d50dd1
--- /dev/null
+++ b/tests/perf/conftest.py
@@ -0,0 +1,146 @@
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+import os
+import subprocess
+import yaml
+from pathlib import Path
+from typing import List
+from datetime import datetime
+
+from otx.api.entities.model_template import ModelTemplate, ModelCategory
+from .benchmark import OTXBenchmark
+
+
+def pytest_addoption(parser):
+    """Add custom options for perf tests."""
+    parser.addoption(
+        "--model-type",
+        action="store",
+        default="all",
+        choices=("default", "all"),
+        help="Choose default|all. Defaults to all.",
+    )
+    parser.addoption(
+        "--data-size",
+        action="store",
+        default="all",
+        choices=("small", "medium", "large", "all"),
+        help="Choose small|medium|large|all. Defaults to all.",
+    )
+    parser.addoption(
+        "--num-repeat",
+        action="store",
+        default=0,
+        help="Overrides the default per-data-size number of repeats. "
+        "Random seeds are set to 0 ~ num_repeat-1 for the trials. "
+        "Defaults to 0 (small=3, medium=3, large=1).",
+    )
+    parser.addoption(
+        "--num-epoch",
+        action="store",
+        default=0,
+        help="Overrides the default per-model number of epochs. "
+        "Defaults to 0 (per-model epoch & early-stopping).",
+    )
+    parser.addoption(
+        "--eval-upto",
+        action="store",
+        default="train",
+        choices=("train", "export", "optimize"),
+        help="Choose train|export|optimize. Defaults to train.",
+    )
+    parser.addoption(
+        "--data-root",
+        action="store",
+        default="data",
+        help="Dataset root directory.",
+    )
+    parser.addoption(
+        "--output-root",
+        action="store",
+        default="exp/perf",
+        help="Output root directory.",
+    )
+    parser.addoption(
+        "--dry-run",
+        action="store_true",
+        default=False,
+        help="Print OTX commands without execution.",
+    )
+
+
+@pytest.fixture(scope="session")
+def fxt_output_root(request: pytest.FixtureRequest) -> Path:
+    """Output root + date + short commit hash."""
+    output_root = request.config.getoption("--output-root")
+    date_str = datetime.now().strftime("%Y%m%d-%H%M%S")
+    commit_str = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("ascii").strip()
+    return Path(output_root) / (date_str + "-" + commit_str)
+
+
+@pytest.fixture
+def fxt_model_id(request: pytest.FixtureRequest) -> str:
+    """Skip by model category."""
+    model_type: str = request.config.getoption("--model-type")
+    model_template: ModelTemplate = request.param
+    if model_type == "default":
+        if model_template.model_category == ModelCategory.OTHER:
+            pytest.skip(f"{model_template.model_category} category model")
+    return model_template.model_template_id
+
+
+@pytest.fixture
+def fxt_benchmark(request: pytest.FixtureRequest, fxt_output_root: Path) -> OTXBenchmark:
+    """Configure benchmark."""
+    # Skip by dataset size
+    data_size_option: str = request.config.getoption("--data-size")
+    data_size: str = request.param[0]
+    if data_size_option != "all":
+        if data_size_option != data_size:
+            pytest.skip(f"{data_size} datasets")
+
+    # Options
+    cfg: dict = request.param[1].copy()
+
+    tags = cfg.get("tags", {})
+    tags["data_size"] = data_size
+    cfg["tags"] = tags
+
+    num_epoch_override: int = int(request.config.getoption("--num-epoch"))
+    if num_epoch_override > 0:  # 0: use default
+        cfg["num_epoch"] = num_epoch_override
+    if "test_speed" in request.node.name:
+        if cfg.get("num_epoch", 0) == 0:  # No user options
+            cfg["num_epoch"] = 2
+
+    num_repeat_override: int = int(request.config.getoption("--num-repeat"))
+    if num_repeat_override > 0:  # 0: use default
+        cfg["num_repeat"] = num_repeat_override
+
+    cfg["eval_upto"] = request.config.getoption("--eval-upto")
+    cfg["data_root"] = request.config.getoption("--data-root")
+    cfg["output_root"] = str(fxt_output_root)
+    cfg["dry_run"] = request.config.getoption("--dry-run")
+
+    # Create benchmark
+    benchmark = OTXBenchmark(
+        **cfg,
+    )
+
+    return benchmark
+
+
+@pytest.fixture(scope="session", autouse=True)
+def fxt_benchmark_summary(fxt_output_root: Path):
+    """Summarize all results at the end of test session."""
+    yield
+    all_results = OTXBenchmark.load_result(fxt_output_root)
+    if all_results is not None:
+        print("=" * 20, "[Benchmark summary]")
+        print(all_results)
+        output_path = fxt_output_root / "benchmark-summary.csv"
+        all_results.to_csv(output_path, index=False)
+        print(f" -> Saved to {output_path}.")
diff --git a/tests/perf/test_anomaly.py b/tests/perf/test_anomaly.py
new file mode 100644
index 00000000000..db16f7172ea
--- /dev/null
+++ b/tests/perf/test_anomaly.py
@@ -0,0 +1,184 @@
+"""OTX Anomaly performance tests."""
+
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from otx.cli.registry import Registry
+from .benchmark import OTXBenchmark
+
+
+class TestPerfAnomalyClassification:
+    """Benchmark anomaly classification."""
+
+    MODEL_TEMPLATES = Registry("src/otx/algorithms").filter(task_type="ANOMALY_CLASSIFICATION").templates
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "anomaly_classification",
+            },
+            "datasets": [
+                "anomaly/mvtec/bottle_small/1",
+                "anomaly/mvtec/bottle_small/2",
+                "anomaly/mvtec/bottle_small/3",
+            ],
+            "num_repeat": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "anomaly_classification",
+            },
+            "datasets": [
+                "anomaly/mvtec/wood_medium",
+            ],
+            "num_repeat": 3,
+        },
+        "large": {
+            "tags": {
+                "task": "anomaly_classification",
+            },
+            "datasets": [
+                "anomaly/mvtec/hazelnut_large",
+            ],
+            "num_repeat": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark accuracy metrics."""
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "accuracy"},
+        )
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark train time per iter / infer time per image."""
+        fxt_benchmark.track_resources = True
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "speed"},
+        )
+
+
+class TestPerfAnomalyDetection:
+    """Benchmark anomaly detection."""
+
+    MODEL_TEMPLATES = Registry("src/otx/algorithms").filter(task_type="ANOMALY_DETECTION").templates
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "anomaly_detection",
+            },
+            "datasets": [
+                "anomaly/mvtec/bottle_small/1",
+                "anomaly/mvtec/bottle_small/2",
+                "anomaly/mvtec/bottle_small/3",
+            ],
+            "num_repeat": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "anomaly_detection",
+            },
+            "datasets": [
+                "anomaly/mvtec/wood_medium",
+            ],
+            "num_repeat": 3,
+        },
+        "large": {
+            "tags": {
+                "task": "anomaly_detection",
+            },
+            "datasets": [
+                "anomaly/mvtec/hazelnut_large",
+            ],
+            "num_repeat": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark accuracy metrics."""
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "accuracy"},
+        )
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark train time per iter / infer time per image."""
+        fxt_benchmark.track_resources = True
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "speed"},
+        )
+
+
+class TestPerfAnomalySegmentation:
+    """Benchmark anomaly segmentation."""
+
+    MODEL_TEMPLATES = Registry("src/otx/algorithms").filter(task_type="ANOMALY_SEGMENTATION").templates
+    MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "anomaly_segmentation",
+            },
+            "datasets": [
+                "anomaly/mvtec/bottle_small/1",
+                "anomaly/mvtec/bottle_small/2",
+                "anomaly/mvtec/bottle_small/3",
+            ],
+            "num_repeat": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "anomaly_segmentation",
+            },
+            "datasets": [
+                "anomaly/mvtec/wood_medium",
+            ],
+            "num_repeat": 3,
+        },
+        "large": {
+            "tags": {
+                "task": "anomaly_segmentation",
+            },
+            "datasets": [
+                "anomaly/mvtec/hazelnut_large",
+            ],
+            "num_repeat": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark accuracy metrics."""
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "accuracy"},
+        )
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark train time per iter / infer time per image."""
+        fxt_benchmark.track_resources = True
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "speed"},
+        )
diff --git a/tests/perf/test_classification.py b/tests/perf/test_classification.py
new file mode 100644
index 00000000000..15a9b4dd133
--- /dev/null
+++ b/tests/perf/test_classification.py
@@ -0,0 +1,179 @@
+"""OTX Classification performance tests."""
+
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from otx.cli.registry import Registry
+from .benchmark import OTXBenchmark
+
+
+MODEL_TEMPLATES = Registry("src/otx/algorithms").filter(task_type="CLASSIFICATION").templates
+MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+
+class TestPerfSingleLabelClassification:
+    """Benchmark single-label classification."""
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "single_label_classification",
+            },
+            "datasets": [
+                "classification/single_label/multiclass_CUB_small/1",
+                "classification/single_label/multiclass_CUB_small/2",
+                "classification/single_label/multiclass_CUB_small/3",
+            ],
+            "num_repeat": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "single_label_classification",
+            },
+            "datasets": [
+                "classification/single_label/multiclass_CUB_medium",
+            ],
+            "num_repeat": 3,
+        },
+        "large": {
+            "tags": {
+                "task": "single_label_classification",
+            },
+            "datasets": [
+                "classification/single_label/multiclass_food101_large",
+            ],
+            "num_repeat": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark accuracy metrics."""
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "accuracy"},
+        )
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark train time per iter / infer time per image."""
+        fxt_benchmark.track_resources = True
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "speed"},
+        )
+
+
+class TestPerfMultiLabelClassification:
+    """Benchmark multi-label classification."""
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "multi_label_classification",
+            },
+            "datasets": [
+                "classification/multi_label/multilabel_CUB_small/1",
+                "classification/multi_label/multilabel_CUB_small/2",
+                "classification/multi_label/multilabel_CUB_small/3",
+            ],
+            "num_repeat": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "multi_label_classification",
+            },
+            "datasets": [
+                "classification/multi_label/multilabel_CUB_medium",
+            ],
+            "num_repeat": 3,
+        },
+        "large": {
+            "tags": {
+                "task": "multi_label_classification",
+            },
+            "datasets": [
+                "classification/multi_label/multilabel_food101_large",
+            ],
+            "num_repeat": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark accuracy metrics."""
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "accuracy"},
+        )
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark train time per iter / infer time per image."""
+        fxt_benchmark.track_resources = True
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "speed"},
+        )
+
+
+class TestPerfHierarchicalLabelClassification:
+    """Benchmark hierarchical-label classification."""
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "hierarchical_label_classification",
+            },
+            "datasets": [
+                "classification/h_label/h_label_CUB_small/1",
+                "classification/h_label/h_label_CUB_small/2",
+                "classification/h_label/h_label_CUB_small/3",
+            ],
+            "num_repeat": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "hierarchical_label_classification",
+            },
+            "datasets": [
+                "classification/h_label/h_label_CUB_medium",
+            ],
+            "num_repeat": 3,
+        },
+        # TODO: Add large dataset
+        # "large": {
+        #     "tags": {
+        #         "task": "hierarchical_label_classification",
+        #     },
+        #     "datasets": [
+        #     ],
+        #     "num_repeat": 1,
+        # },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark accuracy metrics."""
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "accuracy"},
+        )
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark train time per iter / infer time per image."""
+        fxt_benchmark.track_resources = True
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "speed"},
+        )
diff --git a/tests/perf/test_detection.py b/tests/perf/test_detection.py
new file mode 100644
index 00000000000..81ed71c0bac
--- /dev/null
+++ b/tests/perf/test_detection.py
@@ -0,0 +1,69 @@
+"""OTX Detection performance tests."""
+
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from otx.cli.registry import Registry
+from .benchmark import OTXBenchmark
+
+
+MODEL_TEMPLATES = Registry("src/otx/algorithms").filter(task_type="DETECTION").templates
+MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+
+class TestPerfDetection:
+    """Benchmark basic object detection."""
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "detection",
+            },
+            "datasets": [
+                "detection/pothole_small/1",
+                "detection/pothole_small/2",
+                "detection/pothole_small/3",
+            ],
+            "num_repeat": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "detection",
+            },
+            "datasets": [
+                "detection/pothole_medium",
+            ],
+            "num_repeat": 3,
+        },
+        "large": {
+            "tags": {
+                "task": "detection",
+            },
+            "datasets": [
+                "detection/vitens_large",
+            ],
+            "num_repeat": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark accuracy metrics."""
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "accuracy"},
+        )
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark train time per iter / infer time per image."""
+        fxt_benchmark.track_resources = True
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "speed"},
+        )
diff --git a/tests/perf/test_instance_segmentation.py b/tests/perf/test_instance_segmentation.py
new file mode 100644
index 00000000000..6e4a1a9b275
--- /dev/null
+++ b/tests/perf/test_instance_segmentation.py
@@ -0,0 +1,130 @@
+"""OTX Instance Segmentation performance tests."""
+
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from otx.cli.registry import Registry
+from .benchmark import OTXBenchmark
+
+
+MODEL_TEMPLATES = Registry("src/otx/algorithms").filter(task_type="INSTANCE_SEGMENTATION").templates
+MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+
+class TestPerfInstanceSegmentation:
+    """Benchmark basic instance segmentation."""
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "instance_segmentation",
+            },
+            "datasets": [
+                "instance_seg/wgisd_small/1",
+                "instance_seg/wgisd_small/2",
+                "instance_seg/wgisd_small/3",
+            ],
+            "num_repeat": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "instance_segmentation",
+            },
+            "datasets": [
+                "instance_seg/coco_car_person_medium",
+            ],
+            "num_repeat": 3,
+        },
+        "large": {
+            "tags": {
+                "task": "instance_segmentation",
+            },
+            "datasets": [
+                "instance_seg/bdd_large",
+            ],
+            "num_repeat": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark accuracy metrics."""
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "accuracy"},
+        )
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark train time per iter / infer time per image."""
+        fxt_benchmark.track_resources = True
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "speed"},
+        )
+
+
+class TestPerfTilingInstanceSegmentation:
+    """Benchmark tiling instance segmentation."""
+
+    TILING_PARAMS = {
+        "tiling_parameters.enable_tiling": 1,
+    }
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "tiling_instance_segmentation",
+            },
+            "datasets": [
+                "tiling_instance_seg/vitens_aeromonas_small/1",
+                "tiling_instance_seg/vitens_aeromonas_small/2",
+                "tiling_instance_seg/vitens_aeromonas_small/3",
+            ],
+            "num_repeat": 3,
+            "train_params": TILING_PARAMS,
+        },
+        "medium": {
+            "tags": {
+                "task": "tiling_instance_segmentation",
+            },
+            "datasets": [
+                "tiling_instance_seg/vitens_aeromonas_medium",
+            ],
+            "num_repeat": 3,
+            "train_params": TILING_PARAMS,
+        },
+        "large": {
+            "tags": {
+                "task": "tiling_instance_segmentation",
+            },
+            "datasets": [
+                "tiling_instance_seg/bdd_large",
+            ],
+            "num_repeat": 1,
+            "train_params": TILING_PARAMS,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark accuracy metrics."""
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "accuracy"},
+        )
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark train time per iter / infer time per image."""
+        fxt_benchmark.track_resources = True
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "speed"},
+        )
diff --git a/tests/perf/test_semantic_segmentation.py b/tests/perf/test_semantic_segmentation.py
new file mode 100644
index 00000000000..a5ca4086f83
--- /dev/null
+++ b/tests/perf/test_semantic_segmentation.py
@@ -0,0 +1,72 @@
+"""OTX Semantic Segmentation performance tests."""
+
+# Copyright (C) 2023 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from otx.cli.registry import Registry
+from .benchmark import OTXBenchmark
+
+
+MODEL_TEMPLATES = Registry("src/otx/algorithms").filter(task_type="SEGMENTATION").templates
+MODEL_IDS = [template.model_template_id for template in MODEL_TEMPLATES]
+
+
+class TestPerfSemanticSegmentation:
+    """Benchmark basic semantic segmentation."""
+
+    BENCHMARK_CONFIGS = {
+        "small": {
+            "tags": {
+                "task": "semantic_segmentation",
+            },
+            "datasets": [
+                "semantic_seg/kvasir_small/1",
+                "semantic_seg/kvasir_small/2",
+                "semantic_seg/kvasir_small/3",
+            ],
+            "subset_dir_names": {"train": "train", "val": "val", "test": "test"},
+            "num_repeat": 3,
+        },
+        "medium": {
+            "tags": {
+                "task": "semantic_segmentation",
+            },
+            "datasets": [
+                "semantic_seg/kvasir_medium",
+            ],
+            "subset_dir_names": {"train": "train", "val": "val", "test": "test"},
+            "num_repeat": 3,
+        },
+        "large": {
+            "tags": {
+                "task": "semantic_segmentation",
+            },
+            "datasets": [
+                "semantic_seg/kvasir_large",
+            ],
+            "subset_dir_names": {"train": "train", "val": "val", "test": "test"},
+            "num_repeat": 1,
+        },
+    }
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_accuracy(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark accuracy metrics."""
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "accuracy"},
+        )
+
+    @pytest.mark.parametrize("fxt_model_id", MODEL_TEMPLATES, ids=MODEL_IDS, indirect=True)
+    @pytest.mark.parametrize("fxt_benchmark", BENCHMARK_CONFIGS.items(), ids=BENCHMARK_CONFIGS.keys(), indirect=True)
+    def test_speed(self, fxt_model_id: str, fxt_benchmark: OTXBenchmark):
+        """Benchmark train time per iter / infer time per image."""
+        fxt_benchmark.track_resources = True
+        result = fxt_benchmark.run(
+            model_id=fxt_model_id,
+            tags={"benchmark": "speed"},
+        )
diff --git a/tools/experiment.py b/tools/experiment.py
index 7a6de5ee568..f161a5a2372 100644
--- a/tools/experiment.py
+++ b/tools/experiment.py
@@ -790,7 +790,7 @@ def run_experiment_recipe(recipe_file: Union[str, Path], dryrun: bool = False):
     """
     exp_recipe = ExpRecipeParser(recipe_file)
     output_path = exp_recipe.output_path
-    output_path.mkdir(exist_ok=True)
+    output_path.mkdir(parents=True, exist_ok=True)
     current_dir = os.getcwd()
     os.chdir(output_path)
 
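For reference, a minimal sketch of how the new perf suite could be launched locally, assuming the pytest options registered in tests/perf/conftest.py above; the target test module and the option values below are illustrative choices, not mandated by this patch:

    # Run the detection perf benchmark programmatically via pytest (illustrative values).
    import pytest

    pytest.main([
        "tests/perf/test_detection.py",
        "--model-type", "default",   # skip templates whose category is OTHER
        "--data-size", "medium",     # run only the medium-size benchmark configs
        "--num-repeat", "1",         # override the per-size default repeat count
        "--eval-upto", "export",     # train -> eval -> export -> eval
        "--data-root", "data",
        "--output-root", "exp/perf",
    ])

Equivalently, the same flags can be passed to pytest on the command line; results are merged into benchmark-summary.csv under the dated output directory by the fxt_benchmark_summary fixture.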