From cd3bb0b8f485eb1fd7afb25e06feb39823882e8f Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 06:08:08 -0700 Subject: [PATCH 01/63] add base configs Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/__init__.py | 2 + .../base_configs/__init__.py | 20 +++ .../auto_configurator/base_configs/basic.py | 144 ++++++++++++++++++ .../auto_configurator/base_configs/custom.py | 42 +++++ .../auto_configurator/base_configs/gemma.py | 64 ++++++++ .../auto_configurator/base_configs/gpt.py | 62 ++++++++ .../auto_configurator/base_configs/llama.py | 65 ++++++++ .../auto_configurator/base_configs/mistral.py | 62 ++++++++ .../auto_configurator/base_configs/mixtral.py | 62 ++++++++ 9 files changed, 523 insertions(+) create mode 100644 nemo/collections/llm/tools/auto_configurator/__init__.py create mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py create mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/basic.py create mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/custom.py create mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py create mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py create mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/llama.py create mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py create mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py diff --git a/nemo/collections/llm/tools/auto_configurator/__init__.py b/nemo/collections/llm/tools/auto_configurator/__init__.py new file mode 100644 index 000000000000..ac4d7e216725 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/__init__.py @@ -0,0 +1,2 @@ +from nemo.collections.llm.tools.auto_configurator.core.calculate_performance import get_results +from nemo.collections.llm.tools.auto_configurator.runner import AutoConfigurator diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py new file mode 100644 index 000000000000..9aca9661c6a8 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
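+
+# Base configuration helpers for the Auto Configurator. `custom` is a factory
+# function, while Gemma, GPT, Llama, Mistral and Mixtral are classes that all
+# derive from the shared `Basic` base class defined in basic.py.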
+ +from nemo.collections.llm.tools.auto_configurator.base_configs.custom import custom +from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma +from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT +from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama +from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral +from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py new file mode 100644 index 000000000000..616629a876f4 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py @@ -0,0 +1,144 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from megatron.core.optimizer import OptimizerConfig + +from nemo.collections.llm.utils import Config + + +class Basic: + def __init__( + self, + name: str = None, + version: int = None, + size: int = None, + measure: str = "B", + cfg: dict = {}, + ): + """ + :param str name: model name. + :param int version: model version. + :param int size: model size. + :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. + :param dict cfg: auto configurator runner config. + """ + + self.name = name + self.version = version + self.size = size + self.measure = measure + self.cfg = cfg + self.num_nodes = cfg.get("num_nodes") + self.num_gpus = cfg.get("num_gpus") + self.max_steps = cfg.get("max_steps_per_run") + self.seq_length = cfg.get("seq_length") + self.global_batch_size = cfg.get("global_batch_size") + self.tokenizer_path = cfg.get("tokenizer_path") + self.data_paths = cfg.get("data_paths") + self.nemo_run = cfg.get("nemo_run") + self.max_minutes_per_run = cfg.get("max_minutes_per_run") + + def model_config(self): + """Function that returns model config.""" + + None + + def get_optim_config(self) -> OptimizerConfig: + """ + Function that returns optimizer config. + :return: optim config. + :rtype: OptimizerConfig. + """ + optim_params = { + "optimizer": "adam", + "lr": 1e-4, + "min_lr": 1e-5, + "use_distributed_optimizer": True, + "bf16": True, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "overlap_grad_reduce": False, + "overlap_param_gather": True, + } + + if self.nemo_run: + optim_config = Config( + OptimizerConfig, + **optim_params, + ) + else: + optim_config = OptimizerConfig( + **optim_params, + ) + + return optim_config + + def get_trainer_config(self) -> dict: + """ + Function that returns config for PTL trainer. + :return: trainer config. + :rtype: dict. 
+ """ + + trainer_config = { + "accelerator": "gpu", + "enable_checkpointing": False, + "use_distributed_sampler": False, + "max_epochs": None, + "log_every_n_steps": 1, + "limit_val_batches": 1, + "limit_test_batches": 1, + "accumulate_grad_batches": 1, + "gradient_clip_val": 1.0, + "num_nodes": self.num_nodes, + "devices": self.num_gpus, + "max_steps": self.max_steps, + "val_check_interval": self.max_steps, + } + + return trainer_config + + def get_data_config(self) -> dict: + """ + Function that returns dataset config. + :return: data config. + :rtype: dict. + """ + + data_config = { + "paths": self.data_paths, + "seq_length": self.seq_length, + "global_batch_size": self.global_batch_size, + "num_workers": 2, + # "split": "99990,8,2", + "index_mapping_dir": None, + } + + return data_config + + def get_run_config(self) -> dict: + """ + Function that returns config for cluster job. + :return: cluster job config. + :rtype: dict. + """ + + run_config = { + "name": f"{self.name}_{self.size}{self.measure}", + "results_dir": None, + "time_limit": f"0-00:{self.max_minutes_per_run}:00", + } + + return run_config diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py b/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py new file mode 100644 index 000000000000..7f8a283fcaaa --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py @@ -0,0 +1,42 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os + +from nemo.collections.llm.tools.auto_configurator import base_configs + +from .basic import Basic + + +def custom(name, cfg): + """ + Function taht return custom model class. + :param dict cfg: auto configurator runner config. + :return: Custom class object. + """ + basic_class = getattr(base_configs, name) + + class Custom(basic_class): + def __init__(self, name, cfg): + """ + :param str name: model name. + :param dict cfg: auto configurator runner config. + """ + + super().__init__(name=name, cfg=cfg) + + custom_class = Custom(name, cfg) + + return custom_class diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py b/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py new file mode 100644 index 000000000000..d4a2665adf56 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import torch + +from nemo.collections import llm +from nemo.collections.llm.utils import Config + +from .basic import Basic + + +class Gemma(Basic): + def __init__( + self, + name: str = "Gemma", + version: int = None, + size: int = 2, + measure: str = "B", + cfg: dict = {}, + ): + """ + :param str name: model name. + :param int version: model version. + :param int size: model size. + :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. + :param dict cfg: auto configurator runner config. + """ + + super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) + self.config_name = f"{self.name}Config{self.size}{self.measure}" + + def get_model_config(self) -> Config: + """ + Function that returns model config. + :return: model config. + :rtype: Config. + """ + + model_class = getattr(llm, self.config_name) + kwargs = self.cfg.get("model_args", {}) + + if self.nemo_run: + model_config = Config(model_class, **kwargs) + else: + model_config = model_class(**kwargs) + + model_config.global_batch_size = self.global_batch_size + model_config.seq_length = self.seq_length + model_config.pipeline_dtype = torch.bfloat16 + + return model_config diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py b/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py new file mode 100644 index 000000000000..ef4082ac9c87 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os + +from nemo.collections import llm +from nemo.collections.llm.utils import Config + +from .basic import Basic + + +class GPT(Basic): + def __init__( + self, + name: str = "GPT", + version: int = 3, + size: int = 5, + measure: str = "B", + cfg: dict = {}, + ): + """ + :param str name: model name. + :param int version: model version. + :param int size: model size. + :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. + :param dict cfg: auto configurator runner config. + """ + + super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) + self.config_name = f"{self.name}Config{self.size}{self.measure}" + + def get_model_config(self) -> Config: + """ + Function that returns model config. + :return: model config. + :rtype: Config. 
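+        Example (illustrative): with the defaults name="GPT", size=5 and
+        measure="B", config_name resolves to "GPTConfig5B", which is then
+        looked up on the nemo.collections.llm namespace below.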
+ """ + + model_class = getattr(llm, self.config_name) + kwargs = self.cfg.get("model_args", {}) + + if self.nemo_run: + model_config = Config(model_class, **kwargs) + else: + model_config = model_class(**kwargs) + + model_config.global_batch_size = self.global_batch_size + model_config.seq_length = self.seq_length + + return model_config diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py b/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py new file mode 100644 index 000000000000..3676749be653 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py @@ -0,0 +1,65 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import torch + +from nemo.collections import llm +from nemo.collections.llm.utils import Config + +from .basic import Basic + + +class Llama(Basic): + def __init__( + self, + name: str = "Llama", + version: int = 2, + size: int = 7, + measure: str = "B", + cfg: dict = {}, + ): + """ + :param str name: model name. + :param int version: model version. + :param int size: model size. + :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. + :param dict cfg: auto configurator runner config. + """ + + super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) + self.config_name = f"{self.name}{self.version}Config{self.size}{self.measure}" + + def get_model_config(self) -> Config: + """ + Function that returns model config. + :return: model config. + :rtype: Config. + """ + + model_class = getattr(llm, self.config_name) + kwargs = self.cfg.get("model_args", {}) + + if self.nemo_run: + model_config = Config(model_class, **kwargs) + else: + model_config = model_class(**kwargs) + + model_config.global_batch_size = self.global_batch_size + print(self.global_batch_size) + model_config.seq_length = self.seq_length + model_config.pipeline_dtype = torch.bfloat16 + + return model_config diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py b/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py new file mode 100644 index 000000000000..830a754c8730 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import os + +from nemo.collections import llm +from nemo.collections.llm.utils import Config + +from .basic import Basic + + +class Mistral(Basic): + def __init__( + self, + name: str = "Mistral", + version: int = None, + size: int = 7, + measure: str = "B", + cfg: dict = {}, + ): + """ + :param str name: model name. + :param int version: model version. + :param int size: model size. + :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. + :param dict cfg: auto configurator runner config. + """ + + super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) + self.config_name = f"{self.name}Config{self.size}{self.measure}" + + def get_model_config(self) -> Config: + """ + Function that returns model config. + :return: model config. + :rtype: Config. + """ + + model_class = getattr(llm, self.config_name) + kwargs = self.cfg.get("model_args", {}) + + if self.nemo_run: + model_config = Config(model_class, **kwargs) + else: + model_config = model_class(**kwargs) + + model_config.global_batch_size = self.global_batch_size + model_config.seq_length = self.seq_length + + return model_config diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py b/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py new file mode 100644 index 000000000000..ae08e9ef5f05 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os + +from nemo.collections import llm +from nemo.collections.llm.utils import Config + +from .basic import Basic + + +class Mixtral(Basic): + def __init__( + self, + name: str = "Mixtral", + version: int = 8, + size: int = 7, + measure: str = "B", + cfg: dict = {}, + ): + """ + :param str name: model name. + :param int version: model version. + :param int size: model size. + :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. + :param dict cfg: auto configurator runner config. + """ + + super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) + self.config_name = f"{self.name}Config{self.version}x{self.size}{self.measure}" + + def get_model_config(self) -> Config: + """ + Function that returns model config. + :return: model config. + :rtype: Config. 
+ """ + + model_class = getattr(llm, self.config_name) + kwargs = self.cfg.get("model_args", {}) + + if self.nemo_run: + model_config = Config(model_class, **kwargs) + else: + model_config = model_class(**kwargs) + + model_config.global_batch_size = self.global_batch_size + model_config.seq_length = self.seq_length + + return model_config From 28d3c026886f2ae6c260d349be4e0a8aff9f30f6 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 06:22:12 -0700 Subject: [PATCH 02/63] add auto configurator functionality Signed-off-by: dimapihtar --- .../tools/auto_configurator/core/__init__.py | 13 + .../auto_configurator/core/base_config.py | 180 +++ .../core/calculate_performance.py | 351 ++++++ .../auto_configurator/core/search_config.py | 84 ++ .../auto_configurator/core/training_config.py | 1092 +++++++++++++++++ .../llm/tools/auto_configurator/core/utils.py | 529 ++++++++ 6 files changed, 2249 insertions(+) create mode 100644 nemo/collections/llm/tools/auto_configurator/core/__init__.py create mode 100644 nemo/collections/llm/tools/auto_configurator/core/base_config.py create mode 100644 nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py create mode 100644 nemo/collections/llm/tools/auto_configurator/core/search_config.py create mode 100644 nemo/collections/llm/tools/auto_configurator/core/training_config.py create mode 100644 nemo/collections/llm/tools/auto_configurator/core/utils.py diff --git a/nemo/collections/llm/tools/auto_configurator/core/__init__.py b/nemo/collections/llm/tools/auto_configurator/core/__init__.py new file mode 100644 index 000000000000..d9155f923f18 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py new file mode 100644 index 000000000000..1659cdfd2ecb --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -0,0 +1,180 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Generates base configuration for given model.""" + +import math +import os +from typing import Tuple + +from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config + + +def calculate_model_size( + gpu_count: int, + max_training_days: float, + model_size_in_b: float = None, + tflops_per_gpu: int = 140, + num_tokens_in_b: int = 300, + model_name: str = "gpt3", +) -> float: + """ + Estimates a model size to be trained given the constraints. If the + model_size is provided, it estimates the time to train it with the given + constraints. + Example: output 5B params to train for 7 days with 160 GPUs. + :param int gpu_count: number of gpus to use (num_nodes * gpus_per_node). + :param float max_training_days: number of days to train the model for. + :param float model_size_in_b: number of parameters in the model, if known. + :param int tflops_per_gpu: estimated number of TFLOPS/s per GPU. + :param int num_tokens_in_b: number of tokens to train the model for. + :return: number of parameters to use for training. + :rtype: float + """ + # Model size is not known, must be estimated. + if model_size_in_b is None: + model_size_in_b = _estimate_model_size( + max_training_days=max_training_days, + gpu_count=gpu_count, + tflops_per_gpu=tflops_per_gpu, + num_tokens_in_b=num_tokens_in_b, + model_name=model_name, + ) + # Model size is known, so only time to train estimate is needed. + else: + max_training_days = _estimate_training_time( + model_size_in_b=model_size_in_b, + gpu_count=gpu_count, + tflops_per_gpu=tflops_per_gpu, + num_tokens_in_b=num_tokens_in_b, + model_name=model_name, + ) + + print( + f"You can train a {model_size_in_b}B parameter model in " + f"{max_training_days} days using {gpu_count} GPUs. This result assumes " + f"you are training to {num_tokens_in_b}B tokens, and each GPU achieves " + f"{tflops_per_gpu} TFLOPS." + ) + return model_size_in_b + + +def _estimate_model_size( + max_training_days: float, + gpu_count: int, + tflops_per_gpu: int, + num_tokens_in_b: int, + model_name: str, +) -> float: + """ + Estimates model size given time and hardware constraints. It's only used if the model size is + not provided by the user. + :param float max_training_days: number of days to train the model for. + :param int gpu_count: number of gpus to use (num_nodes * gpus_per_node). + :param int tflops_per_gpu: estimated number of TFLOPS/s per GPU. + :param int num_tokens_in_b: number of tokens to train the model for. + :param str model_name: name of the model, such as gpt3, t5, mt5... + :return: number of parameters to use for training. + :rtype: float + :raises NotImplementedError: if the model_name is not one of the supported models. + """ + model_penalty = 0.87 if model_name == "mt5" else 1.0 + valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma"] + try: + if model_name in valid_models: + return round( + model_penalty + * (max_training_days * 3600 * 24 * gpu_count * tflops_per_gpu * 1e12) + / (8 * num_tokens_in_b * 1e9) + / 1e9, + 2, + ) + else: + raise NotImplementedError + except ValueError as err: + print(f"Input values were not valid: {err}") + except ZeroDivisionError as err: + print(f"Cannot divide by zero. 
This can happen if num_tokens_in_b is zero: {err}") + except NotImplementedError as err: + print(f"Model size estimation is only available for {valid_models}: {err}") + return None + + +def _estimate_training_time( + model_size_in_b: float, + gpu_count: int, + tflops_per_gpu: int, + num_tokens_in_b: int, + model_name: str, +) -> float: + """ + Estimates training time for a given model size and hardware constraint. To be used when + a model size is provided by the user. + :param float model_size_in_b: number of parameters to use for training. + :param int gpu_count: number of gpus to use (num_nodes * gpus_per_node). + :param int tflops_per_gpu: estimated number of TFLOPS/s per GPU. + :param int num_tokens_in_b: number of tokens to train the model for. + :param str model_name: name of the model, such as gpt3, t5, mt5... + :return: number of days it will take to train the model. + :rtype: float + :raises NotImplementedError: if the model_name is not one of the supported models. + """ + model_penalty = 1.15 if model_name == "mt5" else 1.0 + valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma"] + try: + if model_name in valid_models: + return round( + model_penalty + * (model_size_in_b * 1e9 * 8 * num_tokens_in_b * 1e9) + / (3600 * 24 * gpu_count * tflops_per_gpu * 1e12), + 2, + ) + else: + raise NotImplementedError + except ValueError as err: + print(f"Input values were not valid: {err}") + except ZeroDivisionError as err: + print(f"Cannot divide by zero. This can happen if gpu_count or tflops_per_gpu are zero: {err}") + except NotImplementedError as err: + print(f"Training time estimation is only available for {valid_models}: {err}") + return None + + +def generate_base_config( + model_name: str, + model_version: int, + model_size_in_b: int, + model_measure: str, + cfg: dict, +): + """ + Generates base config dictionary for a given model name and size. + :param str model_name: name of the model, such as gpt3, t5, mt5... + :param int model_version: version of model to be trained. + :param float model_size_in_b: number of parameters in the model, if known. + :param str model_measure: measure of model size (millions or billions). + :param dict cfg: full config object. + :return: base config object for the given model. + :rtype: dict + """ + + base_cfg = generic_base_config( + model_name=model_name, + model_version=model_version, + model_size_in_b=model_size_in_b, + model_measure=model_measure, + cfg=cfg, + ) + + return base_cfg diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py new file mode 100644 index 000000000000..d4d9c84dd204 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -0,0 +1,351 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
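+
+# This module reads the TensorBoard event files produced by the Auto
+# Configurator candidate runs, estimates model TFLOPS per GPU from the average
+# step time, and writes the ranked results and failed jobs to CSV files.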
+ +import csv +import os +import re +from typing import Optional + +import pandas as pd +from tensorboard.backend.event_processing import event_accumulator + +from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config + + +def get_results( + training_logs: str = None, + path_to_save: str = None, + model_name: str = None, + num_nodes: int = None, + model_version: int = None, + seq_length: int = None, + global_batch_size: int = None, + vocab_size: int = None, + model_size: Optional[int] = None, + model_measure: Optional[str] = "B", + gpus_per_node: Optional[int] = 8, + max_training_days: Optional[int] = 2, + tflops_per_gpu: Optional[int] = 140, + num_tokens_in_b: Optional[int] = 300, + custom_model: Optional[bool] = False, + output_top_n: Optional[int] = 10, +): + """ + :param str training_logs: path to the dicrectory with training logs. + :param str path_to_save: path where to save performance results. + :param str model_name: model name used for auto conf search. + :param int num_nodes: number of nodes used for auto conf search. + :param int model_version: version of model. 3 for GPT3, 2 for Llama2. + :param int seq_length: model sequence length. + :param int global_batch_size: model global batch size. + :param int vocab_size: size of tokenizer vocabulary. + :param Optional[int] model_size: size of model used for auto conf search. + :param Optional[str] model_measure: "M" if model_size is specified in millions. "B" if in billions. + :param Optional[int] gpus_per_node: number of GPUs per node used for auto conf search. + :param Optional[int] max_training_days: number of days expected model to be trained. + :param Optional[int] tflops_per_gpu: estimated tflops per GPU. + :param Optional[int] num_tokens_in_b: number of tokens in billions in train dataset. + :param Optional[bool] custom_model: set to True if custom model was used. + :param Optional[int] output_top_n: Number of configs to be printed out as best configs. 
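+
+        Example (illustrative; paths and values are placeholders):
+            get_results(
+                training_logs="/path/to/training_logs",
+                path_to_save="/path/to/results",
+                model_name="llama",
+                num_nodes=16,
+                model_version=2,
+                seq_length=2048,
+                global_batch_size=2048,
+                vocab_size=32000,
+                model_size=7,
+            )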
+ """ + # Get model architecture + cfg = locals() + cfg["gpu_count"] = num_nodes * gpus_per_node + base_cfg, _ = generic_base_config( + model_name=model_name, + model_version=model_version, + model_size_in_b=model_size, + model_measure=model_measure, + cfg=cfg, + ) + + layers = base_cfg["model"].num_layers + hs = base_cfg["model"].hidden_size + ffn_hs = base_cfg["model"].ffn_hidden_size + + training_logs = training_logs + final_result_logs = path_to_save + + result_columns = [ + "Model Name", + "Model Size", + "Seq Length", + "TP", + "PP", + "CP", + "EP", + "MBS", + "Act Ckpt Layers", + "Act Ckpt Micro Bathes", + "Act Ckpt Layers per Pipeline", + "Num Layers", + "Hidden Size", + "FFN Hidden Size", + "GBS", + "Nodes", + "GPUs per Node", + "Time per Step", + "Samples per Second", + "Model TFLOPS / GPU", + "Model TFLOPS Aggregate", + "Config Name", + ] + error_columns = [ + "Model Name", + "Model Size", + "Seq Length", + "TP", + "PP", + "CP", + "EP", + "MBS", + "Act Ckpt Layers", + "Act Ckpt Micro Bathes", + "Act Ckpt Layers per Pipeline", + "Num Layers", + "Hidden Size", + "FFN Hidden Size", + "GBS", + "Nodes", + "GPUs per Node", + "Error Message", + ] + result = [] + errors = [] + dirs = os.listdir(training_logs) + if ".sdk" in dirs: + dirs.pop(0) + + for candidate_dir in dirs: + logs_dir = os.path.join(training_logs, candidate_dir) + logs_folder = [f.path for f in os.scandir(logs_dir) if f.is_dir()][0] + tp, pp, cp, ep, mbs, act_ckpt, num_mbs_act, act_per_pipe = get_config(candidate_dir) + + for f in os.listdir(logs_folder): + if f.endswith("0.txt"): + error_file = os.path.join(logs_folder, f) + error = find_error(error_file) + if error: + errors.append( + [ + model_name, + model_size, + seq_length, + tp, + pp, + cp, + ep, + mbs, + act_ckpt, + num_mbs_act, + act_per_pipe, + layers, + hs, + ffn_hs, + global_batch_size, + num_nodes, + gpus_per_node, + error, + ] + ) + + files = os.listdir(logs_folder) + for f in files: + if f.startswith("events"): + event_file = os.path.join(logs_folder, f) + ea = event_accumulator.EventAccumulator(event_file) + ea.Reload() + try: + timing_list = ea.Scalars("train_step_timing in s") + if len(timing_list) <= 6: + continue + timing_list = [x.value for x in timing_list[5:]] + avg_global_step_time = round(sum(timing_list) / len(timing_list), 4) + samples_per_s = round(global_batch_size / avg_global_step_time, 2) + m_tflops, m_tflops_gpu = calculate_tflops( + model_name=model_name, + gbs=global_batch_size, + enc_seq_len=seq_length, + dec_seq_len=seq_length, + hs=hs, + ffn_hs=ffn_hs, + layers=layers, + vocab=vocab_size, + nodes=num_nodes, + gpus_per_node=gpus_per_node, + time_per_step=avg_global_step_time, + ) + config_name = f"tp{tp}_pp{pp}_cp{cp}_ep{ep}_mbs{mbs}_act_{act_ckpt}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" + result.append( + [ + model_name, + model_size, + seq_length, + tp, + pp, + cp, + ep, + mbs, + act_ckpt, + num_mbs_act, + act_per_pipe, + layers, + hs, + ffn_hs, + global_batch_size, + num_nodes, + gpus_per_node, + avg_global_step_time, + samples_per_s, + m_tflops_gpu, + m_tflops, + config_name, + ] + ) + finally: + continue + result.sort(key=lambda x: x[17]) + print(f"Top {min(output_top_n, len(result))} configs sorted from fastest to slowest:") + for i, res in enumerate(result): + print(f"Config #{i+1}: {res[-1]} with {res[17]:.4f}s per global step.") + if i + 1 == output_top_n: + break + + top_config = 
f"{model_name}_{model_size}b_{num_nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_cp_{result[0][5]}_ep_{result[0][6]}_mbs_{result[0][7]}_act_ckpt_{result[0][8]}_num_mbs_act_{result[0][9]}_act_per_pipe_{result[0][10]}" + print("\n==================================================") + print(f"Optimal config: {top_config} with {result[0][17]:.4f}s per global step.") + print("==================================================\n") + + # Save results as a CSV file. + os.makedirs(final_result_logs, exist_ok=True) + result_df = pd.DataFrame(result, columns=result_columns) + result_df.to_csv(os.path.join(final_result_logs, f"final_summary_{num_nodes}nodes.csv"), index=False) + + error_df = pd.DataFrame(errors, columns=error_columns) + error_df.to_csv(os.path.join(final_result_logs, f"failed_jobs_{num_nodes}nodes.csv"), index=False) + + +def calculate_tflops( + model_name, + gbs, + enc_seq_len, + dec_seq_len, + hs, + ffn_hs, + layers, + vocab, + nodes, + gpus_per_node, + time_per_step, +): + """Calculates model and hardware TFLOPS for each model. + GPT-3 Formulas: + Model FLOPs = (24𝐵𝑠ℎ^2 + 4𝐵��^2ℎ) x (3 x num_layers) + 6𝐵𝑠ℎ + T5/mT5 Formula: + Model FLOPs = + Bert Formula: + Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL)) + """ + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral"]: + # Model FLOPS calculation + model_flops = ( + (24 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs) * (3 * layers) + + (6 * gbs * enc_seq_len * hs * vocab) + ) / time_per_step + model_flops_per_gpu = model_flops / (nodes * gpus_per_node) + + model_tflops = model_flops / 1e12 + model_tflops_per_gpu = model_flops_per_gpu / 1e12 + + elif model_name == "bert": + model_flops = ( + 72 * gbs * layers * enc_seq_len * hs * hs * (1 + (enc_seq_len / (6 * hs)) + (vocab / (12 * hs * layers))) + ) / time_per_step + model_flops_per_gpu = model_flops / (nodes * gpus_per_node) + model_tflops = model_flops / 1e12 + model_tflops_per_gpu = model_flops_per_gpu / 1e12 + + elif model_name in ["t5", "mt5"]: + # Encoder Layer FLOPS: include self attention + MLP + flops_self_attn_enc = 8 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs + flops_mlp_enc = 6 * gbs * enc_seq_len * hs * ffn_hs # geglu needs two gemms for h -> ffn_h + flops_enc_layer = flops_self_attn_enc + flops_mlp_enc + + # Decoder Layer FLOPS: inlcude self_attn + cross_attn + MLP + flops_self_attn_dec = 8 * gbs * dec_seq_len * hs * hs + 4 * gbs * dec_seq_len * dec_seq_len * hs + flops_cross_attn_dec = ( + 4 * gbs * enc_seq_len * hs * hs + + 4 * gbs * dec_seq_len * hs * hs + + 4 * gbs * enc_seq_len * dec_seq_len * hs + ) + flops_mlp_dec = 6 * gbs * dec_seq_len * hs * ffn_hs # geglu needs two gemms for h -> ffn_h + flops_dec_layer = flops_self_attn_dec + flops_cross_attn_dec + flops_mlp_dec + + # FLOPs of logits layer in the head + flops_logits = 2 * gbs * dec_seq_len * hs * vocab + + # FLOPs of fprop + flops_fprop = (flops_enc_layer + flops_dec_layer) * (layers // 2) + flops_logits + + # FLOPs of each train step (FLOPs of bprop is 2*fprop) + model_flops = 3 * flops_fprop / time_per_step + model_flops_per_gpu = model_flops / (nodes * gpus_per_node) + model_tflops = model_flops / 1e12 + model_tflops_per_gpu = model_flops_per_gpu / 1e12 + + else: + raise NotImplementedError("Model type not supported.") + return round(model_tflops, 2), round(model_tflops_per_gpu, 2) + + +def find_error(error_file: str, errors: list = ["CUDA out of memory"]): + """ + Finds the error among job output. 
+ :param list errors: list of "popular" errors. + :param str error_file: path to the job output. + :return: str error if job has been failed because of one of listed errors and None if not. + :rtype: str + """ + error = None + with open(error_file, "r") as f: + output = f.read() + for e in errors: + if e in output: + error = e + return error + + +def get_config(run_name: str): + pattern = r'_(tp|pp|cp|ep|mbs|act_ckpt|num_mbs_act|act_per_pipe)_([^_]+)' + + # Find all matches in the input string + matches = re.findall(pattern, run_name) + + # Convert matches to a dictionary + params = {param: value for param, value in matches} + + return ( + params["tp"], + params["pp"], + params["cp"], + params["ep"], + params["mbs"], + params["act_ckpt"], + params["num_mbs_act"], + params["act_per_pipe"], + ) + + +if __name__ == "__main__": + main() diff --git a/nemo/collections/llm/tools/auto_configurator/core/search_config.py b/nemo/collections/llm/tools/auto_configurator/core/search_config.py new file mode 100644 index 000000000000..03ea6ca8c74c --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/search_config.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Optional + +from nemo.collections.llm.tools.auto_configurator.core.training_config import search_training_config +from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config + +SUPPORTED_MODELS = [ + "gpt3", + "llama", + "mixtral", + "mistral", + "gemma", +] + + +def search_configs(cfg: dict): + """ + Main function that implements the entire pipeline to search the optimal + model config and launch the grid searches for both training and inference + constraints. + :param dict cfg: main hydra config object for the auto configurator. + :return: dictionary of generated configs. + :rtype: dict + """ + + # Read config + num_nodes = cfg.get("num_nodes") + gpus_per_node = cfg.get("gpus_per_node", 8) + gpu_memory_gb = cfg.get("gpu_memory_gb", 80) + max_training_days = cfg.get("max_training_days", 2) + max_minutes_per_run = cfg.get("max_minutes_per_run", 30) + model_name = cfg.get("model_type") + model_version = cfg.get("model_version") + model_size_in_b = cfg.get("model_size") + model_measure = cfg.get("model_measure", "B") + vocab_size = cfg.get("vocab_size", 32000) + tflops_per_gpu = cfg.get("tflops_per_gpu", 140) + num_tokens_in_b = cfg.get("num_tokens_in_b", 300) + seq_length = cfg.get("seq_length", 2048) + global_batch_size = cfg.get("global_batch_size") + + assert model_name in SUPPORTED_MODELS, f"model must be set to one of {SUPPORTED_MODELS}" + + gpu_count = num_nodes * gpus_per_node + assert isinstance(gpu_count, int) and gpu_count > 0, "num_nodes * gpus_per_node must be an int larger than zero." + assert isinstance(gpu_memory_gb, int) and gpu_memory_gb in ( + 40, + 80, + ), "gpu_memory_gb can only be 40 or 80." 
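+    # Illustrative example of the incoming `cfg` dict (values are placeholders):
+    # {
+    #     "model_type": "llama", "model_version": 2, "model_size": 7,
+    #     "num_nodes": 16, "gpus_per_node": 8, "gpu_memory_gb": 80,
+    #     "seq_length": 2048, "global_batch_size": 2048,
+    #     "max_training_days": 2, "max_minutes_per_run": 30,
+    #     "vocab_size": 32000, "tflops_per_gpu": 140, "num_tokens_in_b": 300,
+    # }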
+ assert ( + isinstance(max_minutes_per_run, int) and max_minutes_per_run >= 10 + ), "max_minutes_per_run must be an int and be at least 10 minutes." + + cfg["model_size_in_b"] = model_size_in_b + cfg["gpu_count"] = gpu_count + cfg["num_gpus"] = gpus_per_node + + # Generate base config for the given model size + base_cfg, train_cfg = generic_base_config( + model_name=model_name, + model_version=model_version, + model_size_in_b=model_size_in_b, + model_measure=model_measure, + cfg=cfg, + ) + + # Launch grid search for training constraints + configs = search_training_config(base_cfg, train_cfg) + + return configs diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py new file mode 100644 index 000000000000..412766cc9f2c --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -0,0 +1,1092 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generates training configs.""" + +import os +import shutil +import subprocess +from dataclasses import dataclass, field +from typing import List, Tuple + +from nemo.collections.llm.tools.auto_configurator.core import utils + + +def search_training_config( + base_cfg: dict, + train_cfg: dict, +) -> None: + """ + Entry point for the Auto Configurator search. This function calls other functions + to generate the grid of possible configurations. + :param dict base_cfg: base configuration of the model to be trained. + :param dict base_cfg: config of the model that will be launched. + :return: dict with generated configs. + """ + # Generate candidate configs. + configs = generate_grid_search_configs(base_cfg, train_cfg) + + return configs + + +def generate_grid_search_configs( + base_cfg: dict, + train_cfg: dict, +) -> Tuple[str, List[int], int]: + """ + Generates the grid of all possible configurations for the given model, and stores + each different configuration in a yaml file. + :param dict base_cfg: base configuration of the model to be trained. + :param dict base_cfg: train configuration of the model to be trained. + :return: dict with generated configs. + """ + + model_name = train_cfg.get("model_type") + model_version = train_cfg.get("model_version") + model_size_in_b = train_cfg.get("model_size_in_b") + model_measure = train_cfg.get("model_measure") + + # 2 * num_layers is needed because of encoder/decoder architecture. 
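+    # Decoder-only models (gpt3, llama, mixtral, mistral, gemma, ...) use a
+    # multiplier of 1; encoder-decoder models use 2 so that both stacks are
+    # counted when checking layer divisibility by the pipeline size.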
+ multiplier = ( + 1 + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"] + else 2 + ) + + seq_length = base_cfg["model"].seq_length + num_layers = ( + base_cfg["model"].num_layers + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"] + else base_cfg["model"].encoder.num_layers + ) + + if model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + ]: + act_method = base_cfg["model"].activations_checkpoint_method + else: + act_method = base_cfg["model"].encoder.activations_checkpoint_method + + ( + tp_list, + pp_list, + cp_list, + ep_list, + mbs_list, + min_model_parallel, + max_model_parallel, + gbs, + ) = _calculate_tp_pp_mbs_grid( + model_size_in_b=model_size_in_b, + num_layers=num_layers, + model_name=model_name, + seq_length=seq_length, + train_cfg=train_cfg, + ) + + max_minutes = train_cfg.get("max_minutes_per_run") + max_steps = train_cfg.get("max_steps_per_run") + num_nodes = train_cfg.get("num_nodes") + + valid_tp_pp_list = [] + for tp in tp_list: + for pp in pp_list: + for cp in cp_list: + for ep in ep_list: + for mbs in mbs_list: + num_gpus = base_cfg["trainer"]["num_nodes"] * base_cfg["trainer"]["devices"] + base_cfg["model"].global_batch_size = gbs + if model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + ]: + att_heads = base_cfg["model"].num_attention_heads + num_layers = base_cfg["model"].num_layers + else: + att_heads = base_cfg["model"].encoder.num_attention_heads + num_layers = base_cfg["model"].encoder.num_layers + model_parallelism = (tp * pp * cp * ep) if (cp and ep) else (tp * pp) + mod_gbs = gbs % (mbs * num_gpus / model_parallelism) + mod_att_heads = att_heads % tp + mod_layers = (multiplier * num_layers) % pp + mod_cp = cp if cp else 1 + mod_ep = ep if ep else 1 + if ( + mod_gbs == 0 + and mod_att_heads == 0 + and mod_layers == 0 + and (tp, pp, cp, ep) not in valid_tp_pp_list + and (mod_cp // mod_ep == mod_cp or mod_ep // mod_cp == mod_ep) + and min_model_parallel <= model_parallelism <= max_model_parallel + ): + valid_tp_pp_list.append((tp, pp, cp, ep)) + + # Generate grid search configs. + configs, base_cfg["auto_config"] = {}, {} + for tp, pp, cp, ep in valid_tp_pp_list: + ( + virtual_pipelines, + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) = _set_activations_checkpoint_params( + tp, + pp, + cp, + ep, + num_layers, + act_method, + multiplier, + model_size_in_b, + model_name, + model_measure, + ) + for mbs in mbs_list: + kwargs = { + "base_cfg": base_cfg, + "act": None, + "num_mbs_act": None, + "act_per_pipe": None, + "tp": tp, + "pp": pp, + "cp": cp, + "ep": ep, + "virtual_pipelines": virtual_pipelines, + "mbs": mbs, + "max_minutes": max_minutes, + "max_steps": max_steps, + "num_nodes": num_nodes, + "model_name": model_name, + } + if act_ckpt_layers[0] is not None: + if act_layers is not None and act_layers != "auto": + act_ckpt_layers = act_layers + for act in act_ckpt_layers: + for num_mbs_act in num_micro_batches_partial_act_ckpt: + for act_per_pipe in act_ckpt_layers_per_pipeline: + kwargs["act"] = act + kwargs["num_mbs_act"] = num_mbs_act + kwargs["act_per_pipe"] = act_per_pipe + new_cfg = utils.modify_cfg(**kwargs) + if new_cfg: # Save candidate cfg. 
+ configs[new_cfg["run"]["name"]] = new_cfg + else: + new_cfg = utils.modify_cfg(**kwargs) + if new_cfg: # Save candidate cfg. + configs[new_cfg["run"]["name"]] = new_cfg + + print(f"\nAll candidate configurations created correctly. Total number of configs: {len(configs)}.\n") + return configs + + +def _set_activations_checkpoint_params( + tp, pp, cp, ep, num_layers, act_method, multiplier, model_size_in_b, model_name, model_measure +): + act_multiple = 4 // pp + model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b + if act_method == "block": + if 1.0 <= model_size_in_b < 11.3: + act_multiple = 8 // pp + elif 11.3 <= model_size_in_b < 26.0: + act_multiple = 16 // pp + elif 26.0 <= model_size_in_b < 60.0: + act_multiple = 16 // pp + elif 60.0 <= model_size_in_b: + act_multiple = 32 // pp + act_multiple = max(act_multiple, 1) + + virtual_pipelines = None + # Num micro batches with partial act ckpt + min_micro_b = 0 # 0 will not be used, minimum will be set to 1 later in the code. + max_micro_b = pp + interval_micro_b = 1 + # Act ckpt layers per pipeline + min_layers_per_pipe = 0 + max_layers_per_pipe = num_layers + interval_layers_per_pipe = act_multiple + if ( + model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"] + and pp > 2 + ): # Interleaved pipeline scheduling. + virtual_pipelines = num_layers // pp # TODO: verify that this is the best value. + act_multiple = 1 + max_micro_b = pp * (virtual_pipelines - 1) + (pp - 1) * 2 + 1 + interval_micro_b = virtual_pipelines * 8 + max_layers_per_pipe = multiplier * num_layers // pp // virtual_pipelines + 1 + + ( + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) = ([None], [None], [None]) + if act_method == "block": + # Act ckpt num layers + if virtual_pipelines is None: + act_ckpt_layers = range(0, multiplier * num_layers // pp + 1, act_multiple) + else: + act_ckpt_layers = range(0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple) + + if pp > 1 and model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + ]: + # Num micro batches with partial act ckpt + num_micro_batches_partial_act_ckpt = list(range(min_micro_b, max_micro_b + 1, interval_micro_b)) + if num_micro_batches_partial_act_ckpt[0] == 0: + num_micro_batches_partial_act_ckpt[0] = 1 + + # Act ckpt layers per pipeline + act_ckpt_layers_per_pipeline = range( + min_layers_per_pipe, max_layers_per_pipe + 1, interval_layers_per_pipe + ) + + return ( + virtual_pipelines, + act_ckpt_layers, + num_micro_batches_partial_act_ckpt, + act_ckpt_layers_per_pipeline, + ) + + +@dataclass +class GPT3GridSearch80gb: + """ + Selects grid search space for TP, PP, MBS parameters for GPT-3 and 80GB GPUs. + :param float model_size_in_b: number of parameters in the model. + :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. + :param int seq length: sequence length to use for training. + :param str model_measure: measure of model size (millions or billions). + :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) + WHERE + int tp is the Tensor Parallelism value to use for training. + int pp is the Pipeline Parallelism value to use for training. + int cp is the Context Parallelism value to use for training. + int ep is the Expert Parallelism value to use for training. + int mbs is the Micro Batch Size to use for training. 
+ int min_model_parallel is min Model parallel size to use for training. + int max_model_parallel is max Model parallel size to use for training. + int gbs is the Global Batch Size to use for training. + """ + + model_size_in_b: int = 5 + valid_pp: List[int] + seq_length: int = 2048 + model_measure: str = "B" + + tp: List[int] = field(default_factory=lambda: [1, 2, 4, 8]) + pp: List[int] = field(default_factory=lambda: [1]) + cp: List[int] = field(default_factory=lambda: [1]) + ep: List[int] = field(default_factory=lambda: [1]) + mbs: List[int] = field(default_factory=lambda: [1, 2, 3, 4, 6, 8]) + min_model_parallel: int = 1 + max_model_parallel: int = 8 + gbs: int = 1024 + + model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b + + if seq_length == 2048: + if model_size_in_b <= 1.0: + tp = [1, 2] + gbs = 256 + elif model_size_in_b <= 4.0: + tp = [1, 2, 4] + gbs = 1024 + elif model_size_in_b <= 8.0: + tp = [1, 2, 4] + gbs = 2048 + elif model_size_in_b <= 13.0: + tp = [1, 2, 4, 8] + gbs = 2048 + elif model_size_in_b <= 23.0: + tp = [1, 2, 4] + pp = [x for x in valid_pp if 1 <= x <= 4] + mbs = [1, 2, 4] + min_model_parallel = 4 + max_model_parallel = 8 + gbs = 2048 + elif model_size_in_b <= 45.0: + tp = [2, 4, 8] + pp = [x for x in valid_pp if 1 <= x <= 4] + mbs = [1, 2, 4] + min_model_parallel = 8 + max_model_parallel = 32 + gbs = 2048 + elif model_size_in_b <= 95: + tp = [2, 4, 8] + pp = [x for x in valid_pp if 1 <= x <= 8] + mbs = [1, 2, 4, 8] + min_model_parallel = 8 + max_model_parallel = 64 + gbs = 2048 + elif model_size_in_b <= 130.0: + tp = [2, 4, 8] + pp = [x for x in valid_pp if 1 <= x <= 16] + mbs = [1, 2, 4, 8] + min_model_parallel = 16 + max_model_parallel = 128 + gbs = 2048 + elif model_size_in_b <= 195.0: + tp = [8] + pp = [x for x in valid_pp if 4 <= x <= 16] + mbs = [1, 2, 4] + min_model_parallel = 32 + max_model_parallel = 256 + gbs = 2048 + elif model_size_in_b <= 395.0: + tp = [8] + pp = [x for x in valid_pp if 8 <= x <= 32] + mbs = [1, 2, 4] + min_model_parallel = 64 + max_model_parallel = 512 + gbs = 2048 + elif model_size_in_b <= 790.0: + tp = [8] + pp = [x for x in valid_pp if 8 <= x <= 100] + mbs = [1, 2, 4] + min_model_parallel = 128 + max_model_parallel = 1024 + gbs = 2048 + elif model_size_in_b <= 1100.0: + tp = [8] + pp = [x for x in valid_pp if 16 <= x <= 130] + mbs = [1, 2, 4] + min_model_parallel = 256 + max_model_parallel = 2048 + gbs = 2048 + elif seq_length == 4096: + if model_size_in_b <= 1.0: + tp = [1, 2, 4] + mbs = [1, 2, 4, 8] + gbs = 128 + elif model_size_in_b <= 4.0: + tp = [1, 2, 4] + mbs = [1, 2, 4, 8] + gbs = 512 + elif model_size_in_b <= 8.0: + tp = [1, 2, 4] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1, 2, 4] + gbs = 1024 + elif model_size_in_b <= 13.0: + tp = [2, 4] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1, 2, 4] + gbs = 1024 + elif model_size_in_b <= 23.0: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1, 2] + min_model_parallel = 4 + max_model_parallel = 16 + gbs = 1024 + elif model_size_in_b <= 45.0: + tp = [4, 8] + pp = [x for x in valid_pp if 2 <= x <= 4] + mbs = [1, 2] + min_model_parallel = 8 + max_model_parallel = 32 + gbs = 1024 + elif model_size_in_b <= 95: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 8] + mbs = [1, 2] + min_model_parallel = 8 + max_model_parallel = 64 + gbs = 1024 + elif seq_length == 8192: + if model_size_in_b <= 1.0: + tp = [1, 2] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1, 2, 4] + gbs = 64 + elif model_size_in_b <= 
4.0: + tp = [1, 2, 4] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1, 2, 4] + gbs = 128 + elif model_size_in_b <= 8.0: + tp = [2, 4] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1, 2] + gbs = 256 + elif model_size_in_b <= 13.0: + tp = [2, 4] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1, 2] + gbs = 256 + elif model_size_in_b <= 23.0: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 4] + mbs = [1] + min_model_parallel = 8 + max_model_parallel = 32 + gbs = 256 + elif model_size_in_b <= 45.0: + tp = [8] + pp = [x for x in valid_pp if 4 <= x <= 8] + mbs = [1] + min_model_parallel = 32 + max_model_parallel = 64 + gbs = 256 + elif seq_length == 16384: + if model_size_in_b <= 1.0: + tp = [2, 4] + mbs = [1, 2] + gbs = 32 + elif model_size_in_b <= 4.0: + tp = [2, 4] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1] + gbs = 64 + elif model_size_in_b <= 8.0: + tp = [2, 4] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1] + gbs = 128 + elif model_size_in_b <= 13.0: + tp = [2, 4] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1] + gbs = 128 + elif model_size_in_b <= 23.0: + tp = [4, 8] + pp = [x for x in valid_pp if 2 <= x <= 4] + mbs = [1] + min_model_parallel = 8 + max_model_parallel = 32 + gbs = 128 + elif seq_length == 32768: + if model_size_in_b <= 1.0: + tp = [2, 4] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1] + gbs = 16 + elif model_size_in_b <= 4.0: + tp = [2, 4] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1] + gbs = 32 + elif model_size_in_b <= 8.0: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 2] + min_model_parallel = 4 + max_model_parallel = 16 + mbs = [1] + gbs = 64 + elif model_size_in_b <= 13.0: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 2] + min_model_parallel = 4 + max_model_parallel = 16 + mbs = [1] + gbs = 64 + elif model_size_in_b <= 23.0: + tp = [8] + pp = [x for x in valid_pp if 2 <= x <= 4] + mbs = [1] + min_model_parallel = 16 + max_model_parallel = 32 + gbs = 64 + + +def _tp_pp_mbs_grid_gpt3_40gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: + """ + Selects grid search space for TP, PP, MBS parameters for GPT-3 and 40GB GPUs. + :param float model_size_in_b: number of parameters in the model. + :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. + :param str model_measure: measure of model size (millions or billions). + :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) + WHERE + int tp is the Tensor Parallelism value to use for training. + int pp is the Pipeline Parallelism value to use for training. + int cp is the Context Parallelism value to use for training. + int ep is the Expert Parallelism value to use for training. + int mbs is the Micro Batch Size to use for training. + int min_model_parallel is min Model parallel size to use for training. + int max_model_parallel is max Model parallel size to use for training. + int gbs is the Global Batch Size to use for training. 
+ """ + tp = [1, 2, 4, 8] + pp = [1] + cp = [1] + ep = [1] + mbs = [1, 2, 4, 6, 8, 10, 12, 16] + min_model_parallel = 1 + max_model_parallel = 8 + gbs = 1024 + model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b + if model_size_in_b <= 1.0: + tp = [1, 2, 4] + mbs = [1, 2, 4, 8] + gbs = 256 + elif model_size_in_b <= 4.0: + tp = [1, 2, 4, 8] + mbs = [1, 2, 4, 8] + gbs = 1024 + elif model_size_in_b <= 8.0: + tp = [2, 4, 8] + pp = [1, 2] + mbs = [1, 2, 4] + min_model_parallel = 2 + gbs = 2048 + elif model_size_in_b <= 13.0: + tp = [4, 8] + pp = [1, 2, 4] + mbs = [1, 2, 4] + min_model_parallel = 4 + max_model_parallel = 32 + gbs = 2048 + elif model_size_in_b <= 23.0: + tp = [2, 4, 8] + pp = [x for x in valid_pp if 1 <= x <= 8] + min_model_parallel = 8 + max_model_parallel = 64 + gbs = 2048 + elif model_size_in_b <= 45.0: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 12] + mbs = [1, 2, 4] + min_model_parallel = 16 + max_model_parallel = 128 + gbs = 2048 + elif model_size_in_b <= 95: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 16] + mbs = [1, 2, 4] + min_model_parallel = 16 + max_model_parallel = 256 + gbs = 2048 + elif model_size_in_b <= 130.0: + tp = [4, 8] + pp = [x for x in valid_pp if 2 <= x <= 26] + mbs = [1, 2] + min_model_parallel = 32 + max_model_parallel = 512 + gbs = 2048 + elif model_size_in_b <= 195.0: + tp = [4, 8] + pp = [x for x in valid_pp if 2 <= x <= 32] + mbs = [1, 2] + min_model_parallel = 64 + max_model_parallel = 1024 + gbs = 2048 + elif model_size_in_b <= 395.0: + tp = [4, 8] + pp = [x for x in valid_pp if 4 <= x <= 64] + mbs = [1, 2] + min_model_parallel = 128 + max_model_parallel = 2048 + gbs = 2048 + elif model_size_in_b <= 790.0: + tp = [4, 8] + pp = [x for x in valid_pp if 8 <= x <= 128] + mbs = [1, 2] + min_model_parallel = 256 + max_model_parallel = 4096 + gbs = 2048 + elif model_size_in_b <= 1100.0: + tp = [4, 8] + pp = [x for x in valid_pp if 8 <= x <= 192] + mbs = [1, 2] + min_model_parallel = 512 + max_model_parallel = 8192 + gbs = 2048 + return tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs + + +def _tp_pp_mbs_grid_t5_80gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: + """ + Selects grid search space for TP, PP, MBS parameters for T5/mT5 and 80GB GPUs. + :param float model_size_in_b: number of parameters in the model. + :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. + :param str model_measure: measure of model size (millions or billions). + :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) + WHERE + int tp is the Tensor Parallelism value to use for training. + int pp is the Pipeline Parallelism value to use for training. + int cp is the Context Parallelism value to use for training. + int ep is the Expert Parallelism value to use for training. + int mbs is the Micro Batch Size to use for training. + int min_model_parallel is min Model parallel size to use for training. + int max_model_parallel is max Model parallel size to use for training. + int gbs is the Global Batch Size to use for training. 
+ """ + tp = [1, 2, 4, 8] + pp = [1] + cp = [None] + ep = [None] + mbs = [1, 2, 4, 6, 8, 12, 16] + min_model_parallel = 1 + max_model_parallel = 8 + gbs = 1920 + model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b + if model_size_in_b <= 1.0: + tp = [1, 2] + mbs = [16, 32, 64, 128] + gbs = 2048 + elif model_size_in_b <= 4.0: + tp = [1, 2, 4] + mbs = [4, 6, 8, 12, 16, 24, 32, 48] + gbs = 1920 + elif model_size_in_b <= 8.0: + tp = [2, 4, 8] + mbs = [4, 6, 8, 12, 16, 24, 32] + gbs = 1920 + elif model_size_in_b <= 14.5: + tp = [4, 8] + mbs = [2, 4, 6, 8, 12, 16, 24] + gbs = 1920 + elif model_size_in_b <= 25.9: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [1, 2, 4, 6, 8] + min_model_parallel = 4 + max_model_parallel = 16 + gbs = 1920 + elif model_size_in_b <= 43.0: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 4] + mbs = [1, 2, 4, 6, 8] + min_model_parallel = 8 + max_model_parallel = 32 + gbs = 1920 + elif model_size_in_b <= 85.5: + tp = [4, 8] + pp = [x for x in valid_pp if 2 <= x <= 8] + mbs = [1, 2, 4, 6, 8] + min_model_parallel = 16 + max_model_parallel = 64 + gbs = 1920 + elif model_size_in_b <= 165.5: + tp = [8] + pp = [x for x in valid_pp if 4 <= x <= 16] + mbs = [1, 2, 4, 6] + min_model_parallel = 32 + max_model_parallel = 128 + gbs = 1920 + elif model_size_in_b <= 250: + tp = [8] + pp = [x for x in valid_pp if 4 <= x <= 32] + mbs = [1, 2, 4, 6, 8] + min_model_parallel = 64 + max_model_parallel = 256 + gbs = 1920 + return tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs + + +def _tp_pp_mbs_grid_t5_40gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: + """ + Selects grid search space for TP, PP, MBS parameters for T5/mT5 and 40GB GPUs. + :param float model_size_in_b: number of parameters in the model. + :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. + :param str model_measure: measure of model size (millions or billions). + :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) + WHERE + int tp is the Tensor Parallelism value to use for training. + int pp is the Pipeline Parallelism value to use for training. + int cp is the Context Parallelism value to use for training. + int ep is the Expert Parallelism value to use for training. + int mbs is the Micro Batch Size to use for training. + int min_model_parallel is min Model parallel size to use for training. + int max_model_parallel is max Model parallel size to use for training. + int gbs is the Global Batch Size to use for training. 
+ """ + tp = [1, 2, 4, 8] + pp = [1] + cp = [None] + ep = [None] + mbs = [1, 2, 4, 6, 8, 12, 16] + min_model_parallel = 1 + max_model_parallel = 8 + gbs = 1920 + model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b + if model_size_in_b <= 1.0: + tp = [1, 2] + mbs = [16, 32, 64, 128] + gbs = 2048 + elif model_size_in_b <= 4.0: + tp = [1, 2, 4] + mbs = [4, 8, 12, 16, 24, 32, 48] + gbs = 1920 + elif model_size_in_b <= 8.0: + tp = [2, 4, 8] + mbs = [4, 6, 8, 12, 16, 24] + gbs = 1920 + elif model_size_in_b <= 14.5: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 2] + mbs = [2, 4, 6, 8, 12, 16] + min_model_parallel = 4 + max_model_parallel = 16 + gbs = 1920 + elif model_size_in_b <= 25.9: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 8] + mbs = [1, 2, 4, 6, 8] + min_model_parallel = 8 + max_model_parallel = 32 + gbs = 1920 + elif model_size_in_b <= 43.0: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 8] + mbs = [1, 2, 4, 6, 8] + min_model_parallel = 16 + max_model_parallel = 32 + gbs = 1920 + elif model_size_in_b <= 85.5: + tp = [8] + pp = [x for x in valid_pp if 2 <= x <= 8] + mbs = [1, 2, 4, 6, 8] + min_model_parallel = 32 + max_model_parallel = 64 + gbs = 1920 + elif model_size_in_b <= 165.5: + tp = [8] + pp = [x for x in valid_pp if 4 <= x <= 32] + mbs = [1, 2, 4] + min_model_parallel = 64 + max_model_parallel = 128 + gbs = 1920 + elif model_size_in_b <= 250: + tp = [8] + pp = [x for x in valid_pp if 8 <= x <= 64] + mbs = [1, 2, 4] + min_model_parallel = 128 + max_model_parallel = 256 + gbs = 1920 + return tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs + + +def _tp_pp_mbs_grid_bert_80gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: + """ + Selects grid search space for TP, PP, MBS parameters for BERT and 80GB GPUs. + :param float model_size_in_b: number of parameters in the model. + :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. + :param str model_measure: measure of model size (millions or billions). + :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) + WHERE + int tp is the Tensor Parallelism value to use for training. + int pp is the Pipeline Parallelism value to use for training. + int cp is the Context Parallelism value to use for training. + int ep is the Expert Parallelism value to use for training. + int mbs is the Micro Batch Size to use for training. + int min_model_parallel is min Model parallel size to use for training. + int max_model_parallel is max Model parallel size to use for training. + int gbs is the Global Batch Size to use for training. 
+ """ + pp = [1] + cp = [None] + ep = [None] + mbs = [1, 2, 3, 4, 6, 8] + min_model_parallel = 1 + max_model_parallel = 8 + gbs = 1024 + model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b + if model_size_in_b <= 1.0: + tp = [1, 2] + gbs = 256 + elif model_size_in_b <= 4.0: + tp = [1, 2, 4] + gbs = 1024 + elif model_size_in_b <= 8.0: + tp = [2, 4, 8] + min_model_parallel = 2 + gbs = 2048 + elif model_size_in_b <= 13.0: + tp = [2, 4, 8] + mbs = [1, 2, 3, 4, 6] + min_model_parallel = 2 + gbs = 2048 + elif model_size_in_b <= 25.0: + tp = [4, 8] + mbs = [1, 2, 3, 4] + min_model_parallel = 4 + gbs = 2048 + elif model_size_in_b <= 46.5: + tp = [4, 8] + pp = [1, 2, 4] + mbs = [1, 2, 3, 4] + min_model_parallel = 4 + max_model_parallel = 16 + gbs = 2048 + elif model_size_in_b <= 87.5: + tp = [4, 8] + pp = [2, 4, 6, 8] + mbs = [1, 2, 3, 4] + min_model_parallel = 8 + max_model_parallel = 32 + gbs = 2048 + elif model_size_in_b <= 165.5: + tp = [4, 8] + pp = [4, 6, 8, 16] + mbs = [2, 4, 6, 8] + min_model_parallel = 16 + max_model_parallel = 128 + gbs = 2048 + elif model_size_in_b <= 250.5: + tp = [8] + pp = [4, 8, 16, 32] + mbs = [1, 2, 3, 4] + min_model_parallel = 32 + max_model_parallel = 256 + gbs = 2048 + else: + raise ValueError("No BERT model larger than 250B parameters is supported.") + return tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs + + +def _tp_pp_mbs_grid_bert_40gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: + """ + Selects grid search space for TP, PP, MBS parameters for BERT and 40GB GPUs. + :param float model_size_in_b: number of parameters in the model. + :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. + :param str model_measure: measure of model size (millions or billions). + :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) + WHERE + int tp is the Tensor Parallelism value to use for training. + int pp is the Pipeline Parallelism value to use for training. + int cp is the Context Parallelism value to use for training. + int ep is the Expert Parallelism value to use for training. + int mbs is the Micro Batch Size to use for training. + int min_model_parallel is min Model parallel size to use for training. + int max_model_parallel is max Model parallel size to use for training. + int gbs is the Global Batch Size to use for training. 
+ """ + pp = [1] + cp = [None] + ep = [None] + mbs = [1, 2, 4, 6, 8] + min_model_parallel = 1 + max_model_parallel = 8 + gbs = 1024 + model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b + if model_size_in_b <= 1.0: + tp = [1, 2, 4] + gbs = 256 + elif model_size_in_b <= 4.0: + tp = [1, 2, 4, 8] + gbs = 1024 + elif model_size_in_b <= 8.0: + tp = [2, 4, 8] + mbs = [1, 2, 4] + gbs = 2048 + elif model_size_in_b <= 13.0: + tp = [2, 4, 8] + mbs = [1, 2, 4] + gbs = 2048 + elif model_size_in_b <= 25.0: + tp = [2, 4, 8] + pp = [1, 2] + mbs = [1, 2, 4] + min_model_parallel = 2 + max_model_parallel = 16 + gbs = 2048 + elif model_size_in_b <= 46.5: + tp = [4, 8] + pp = [1, 2, 4, 8] + mbs = [1, 2, 3] + min_model_parallel = 8 + max_model_parallel = 32 + gbs = 2048 + elif model_size_in_b <= 87.5: + tp = [4, 8] + pp = [2, 4, 6, 8] + mbs = [1, 2, 3] + min_model_parallel = 16 + max_model_parallel = 64 + gbs = 2048 + elif model_size_in_b <= 165.5: + tp = [8] + pp = [4, 6, 8, 16] + mbs = [1, 2] + min_model_parallel = 32 + max_model_parallel = 256 + gbs = 2048 + elif model_size_in_b <= 250.5: + tp = [8] + pp = [8, 16, 32] + mbs = [1, 2] + min_model_parallel = 64 + max_model_parallel = 512 + gbs = 2048 + else: + raise ValueError("No BERT model larger than 250B parameters is supported.") + return tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs + + +def _calculate_tp_pp_mbs_grid( + model_size_in_b: float, + num_layers: int, + model_name: str, + seq_length: int, + train_cfg: dict, +) -> Tuple[int, int, int]: + """ + Selects grid search space for TP, PP, MBS parameters for any model, and calls the necessary + heuristics function accordingly. + :param float model_size_in_b: number of parameters in the model. + :param int num_layers: number of layers in the model config. + :param str model_name: name of the model to be used, such as gpt3, t5, mt5... + :param int seq_length: sequence length to use for training. + :param dict train_cfg: config of the model that will be launched. + :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) + WHERE + int tp is the Tensor Parallelism value to use for training. + int pp is the Pipeline Parallelism value to use for training. + int cp is the Context Parallelism value to use for training. + int ep is the Expert Parallelism value to use for training. + int mbs is the Micro Batch Size to use for training. + int min_model_parallel is min Model parallel size to use for training. + int max_model_parallel is max Model parallel size to use for training. + int gbs is the Global Batch Size to use for training. + :raises NotImplementedError: if the model_name is not one of the supported models. 
+ """ + tp_sizes = train_cfg.get("tensor_parallel_sizes") + pp_sizes = train_cfg.get("pipeline_parallel_sizes") + cp_sizes = train_cfg.get("context_parallel_sizes", None) + ep_sizes = train_cfg.get("expert_parallel_sizes", None) + min_model_parallel_size = train_cfg.get("min_model_parallel_size") + max_model_parallel_size = train_cfg.get("max_model_parallel_size") + mbs_sizes = train_cfg.get("micro_batch_sizes") + gbs_size = train_cfg.get("global_batch_size") + gpu_memory_gb = train_cfg.get("gpu_memory_gb") + model_measure = train_cfg.get("model_measure") + multiplier = ( + 1 + if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"] + else 2 + ) + init_pp = ( + [] if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"] else [1] + ) + valid_pp = init_pp + [ + multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 + ] # Only divisors of num_layers are possible. + + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: + if gpu_memory_gb == 80: + print(model_size_in_b, valid_pp, seq_length, model_measure) + params = GPT3GridSearch80gb( + model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length, model_measure=model_measure + ) + elif gpu_memory_gb == 40: + ( + tp, + pp, + cp, + ep, + mbs, + min_model_parallel, + max_model_parallel, + gbs, + ) = _tp_pp_mbs_grid_gpt3_40gb( + model_size_in_b=model_size_in_b, valid_pp=valid_pp, model_measure=model_measure + ) + elif model_name in ["t5", "mt5"]: + if gpu_memory_gb == 80: + ( + tp, + pp, + cp, + ep, + mbs, + min_model_parallel, + max_model_parallel, + gbs, + ) = _tp_pp_mbs_grid_t5_80gb( + model_size_in_b=model_size_in_b, valid_pp=valid_pp, model_measure=model_measure + ) + elif gpu_memory_gb == 40: + ( + tp, + pp, + cp, + ep, + mbs, + min_model_parallel, + max_model_parallel, + gbs, + ) = _tp_pp_mbs_grid_t5_40gb( + model_size_in_b=model_size_in_b, valid_pp=valid_pp, model_measure=model_measure + ) + elif model_name == "bert": + if gpu_memory_gb == 80: + ( + tp, + pp, + cp, + ep, + mbs, + min_model_parallel, + max_model_parallel, + gbs, + ) = _tp_pp_mbs_grid_bert_80gb( + model_size_in_b=model_size_in_b, valid_pp=valid_pp, model_measure=model_measure + ) + elif gpu_memory_gb == 40: + ( + tp, + pp, + cp, + ep, + mbs, + min_model_parallel, + max_model_parallel, + gbs, + ) = _tp_pp_mbs_grid_bert_40gb( + model_size_in_b=model_size_in_b, valid_pp=valid_pp, model_measure=model_measure + ) + else: + raise NotImplementedError("Model name not implemented.") + + # Override the tp, pp, mbs search if indicated in the config params. 
+ if tp_sizes is not None and tp_sizes != "auto": + tp = tp_sizes + if pp_sizes is not None and pp_sizes != "auto": + pp = pp_sizes + if cp_sizes is not None and cp_sizes != "auto": + cp = cp_sizes + if ep_sizes is not None and ep_sizes != "auto": + ep = ep_sizes + if mbs_sizes is not None and mbs_sizes != "auto": + mbs = mbs_sizes + if gbs_size is not None and gbs_size != "auto": + gbs = gbs_size + if min_model_parallel_size is not None and min_model_parallel_size != "auto": + min_model_parallel = min_model_parallel_size + if max_model_parallel_size is not None and max_model_parallel_size != "auto": + max_model_parallel = max_model_parallel_size + return params diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py new file mode 100644 index 000000000000..6e2a9a8208d0 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -0,0 +1,529 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utility functions for the Auto Configurator tool.""" +import copy +from typing import List, Optional, Tuple + +from nemo.collections.llm.tools.auto_configurator import base_configs + +MODULES = { + "gpt3": "GPT", + "llama": "Llama", + "mixtral": "Mixtral", + "mistral": "Mistral", + "gemma": "Gemma", +} + + +def _calculate_model_size( + vocab_size: int = None, + seq_length: int = None, + hidden_size: int = None, + num_layers: int = None, + ffn_size: int = None, + kv_channels: int = None, + att_heads: int = None, + model_name: str = "gpt3", +): + """ + Calculates the model size (number of parameters in billions), given the model parameters + and name. + :param int vocab_size: vocabulary size to be used during training. + :param int seq_length: input sequence length to be used during training. + :param int hidden_size: size of the hidden layers of the model. + :param int num_layers: number of layers in the model. + :param int ffn_size: FFN size of the model. + :param int kv_channels: number of KV channels in the transformer layers. + :param int att_heads: number of attention heads in the transformer layers. + :param str model_name: name of the model, i.e gpt3, t5, mt5... + :return: size of the model in billions of parameters. + :rtype: float + :raises NotImplementedError: if the model name is not valid. 
+ """ + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: + model_size = ( + 12 + * num_layers + * hidden_size**2 + * (1 + (13 / (12 * hidden_size)) + ((vocab_size + seq_length) / (12 * num_layers * hidden_size))) + / 1e9 + ) + elif model_name in ["t5", "mt5"]: + # 2 L F + 3 L P + H (2 + 4 L F + L (21 + 12 P) + 1 S + 1 V) + proj_size = att_heads * kv_channels + model_size = ( + 2 * num_layers * 1.5 * ffn_size + + 3 * num_layers * proj_size + + hidden_size + * (2 + 4 * num_layers * 1.5 * ffn_size + num_layers * (21 + 12 * proj_size) + seq_length + vocab_size) + ) / 1e9 + elif model_name == "bert": + model_size = ( + num_layers * (ffn_size + hidden_size * (4 * hidden_size + 3 * att_heads + 2 * ffn_size + 6)) + + hidden_size * (vocab_size + seq_length + hidden_size + 5) + ) / 1e9 + + else: + raise NotImplementedError("Model name is not valid.") + + return model_size + + +def calculate_model_size_params( + model_size_in_b: float, + vocab_size: int = 51200, + seq_length: int = 2048, + model_name: str = "gpt3", +) -> Tuple[int, int, float]: + """ + Calculates the parameters that affect model_size: hidden size, attention heads, + KV channels, and FFN size. It also calculates the learning rate. + :param float model_size_in_b: float, number of parameters in the desired model config, in billions. + :param int seq_length: int, sequence length to be used during training. + :param int vocab_size: int, size of the vocabulary to use for training. + :param str model_name: str, name of the model to be trained, i.e. gpt3, t5, mt5... + :returns: tuple (layers, hs, att_h, ffn, kv, lr) + WHERE + int layers is the number of layers in the model. + int hs is the hidden size of the model. + int att_h is the number of attention heads in the model. + int ffn is the FFN hidden size of the model. + int kv is the number of KV channels in the model. + float lr is the learning rate used to train the model. + :raises ValueError: if the model size is larger than the max supported model size. + :raises NotImplementedError: if the model name is not supported. + """ + ffn, kv = None, None # Only needed for some models. 
+ if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: + if model_size_in_b < 0.25: + hs, att_h, lr = 768, 12, 6e-4 + elif model_size_in_b < 0.5: + hs, att_h, lr = 1024, 16, 3e-4 + elif model_size_in_b < 1: + hs, att_h, lr = 1536, 16, 2.5e-4 + elif model_size_in_b < 2: + hs, att_h, lr = 2048, 16, 2e-4 + elif model_size_in_b < 3: + hs, att_h, lr = 2560, 32, 1.6e-4 + elif model_size_in_b < 4.5: + hs, att_h, lr = 3072, 32, 1.4e-4 + elif model_size_in_b < 8: + hs, att_h, lr = 4096, 32, 1.2e-4 + elif model_size_in_b < 15: + hs, att_h, lr = 5120, 40, 1e-4 + elif model_size_in_b < 25: + hs, att_h, lr = 6144, 48, 1e-4 + elif model_size_in_b < 52: + hs, att_h, lr = 8192, 64, 0.8e-4 + elif model_size_in_b < 105: + hs, att_h, lr = 10240, 80, 0.7e-4 + elif model_size_in_b < 205: + hs, att_h, lr = 12288, 96, 0.6e-4 + elif model_size_in_b < 405: + hs, att_h, lr = 20480, 128, 0.5e-4 + elif model_size_in_b < 805: + hs, att_h, lr = 20480, 128, 0.4e-4 + elif model_size_in_b < 1105: + hs, att_h, lr = 25600, 160, 0.3e-4 + else: + raise ValueError("Model_size for GPT-3 must be smaller than 1.1T parameters.") + elif model_name == "t5": + kv, lr = 64, 1e-4 + if model_size_in_b < 0.1: + hs, att_h, ffn = 512, 6, 1024 + elif model_size_in_b < 0.4: + hs, att_h, ffn = 768, 12, 2048 + elif model_size_in_b < 1: + hs, att_h, ffn = 1024, 16, 2816 + elif model_size_in_b < 5: + hs, att_h, ffn = 2048, 32, 5120 + elif model_size_in_b < 15: + hs, att_h, ffn = 4096, 64, 10240 + elif model_size_in_b < 25.9: + hs, att_h, ffn = 5120, 80, 10880 + elif model_size_in_b < 43.0: + hs, att_h, ffn = 6144, 96, 10880 + elif model_size_in_b <= 85.5: + hs, att_h, ffn = 6144, 96, 16384 + elif model_size_in_b <= 165.5: + hs, att_h, ffn, kv = 7680, 96, 20480, 128 + elif model_size_in_b <= 250: + hs, att_h, ffn, kv = 12288, 96, 32768, 128 + else: + raise ValueError("Model_size for T5 must be smaller than 250B parameters.") + elif model_name == "mt5": + kv, lr = 64, 1e-4 + if model_size_in_b < 0.25: + hs, att_h, ffn = 512, 6, 1024 + elif model_size_in_b < 0.5: + hs, att_h, ffn = 768, 12, 2048 + elif model_size_in_b < 1.2: + hs, att_h, ffn = 1024, 16, 2816 + elif model_size_in_b < 5: + hs, att_h, ffn = 2048, 32, 5120 + elif model_size_in_b < 15: + hs, att_h, ffn = 4096, 64, 10240 + elif model_size_in_b < 25.9: + hs, att_h, ffn = 5120, 80, 10880 + elif model_size_in_b < 43.0: + hs, att_h, ffn = 6144, 96, 10880 + elif model_size_in_b <= 85.5: + hs, att_h, ffn = 6144, 96, 16384 + elif model_size_in_b <= 165.5: + hs, att_h, ffn, kv = 7680, 96, 20480, 128 + elif model_size_in_b <= 250: + hs, att_h, ffn, kv = 12288, 96, 32768, 128 + else: + raise ValueError("Model_size for mT5 must be smaller than 250B parameters.") + elif model_name == "bert": + lr = 1e-4 + if model_size_in_b < 0.25: + hs, att_h, lr = 768, 12, 2e-4 + elif model_size_in_b < 0.5: + hs, att_h, lr = 1024, 16, 2e-4 + elif model_size_in_b < 1: + hs, att_h = 1536, 16 + elif model_size_in_b < 2: + hs, att_h = 2048, 16 + elif model_size_in_b < 3: + hs, att_h = 2560, 32 + elif model_size_in_b < 4.5: + hs, att_h = 2560, 32 + elif model_size_in_b < 8: + hs, att_h = 4096, 32 + elif model_size_in_b < 15: + hs, att_h = 5120, 40 + elif model_size_in_b <= 25: + hs, att_h = 6144, 48 + elif model_size_in_b <= 46.5: + hs, att_h = 7680, 48 + elif model_size_in_b <= 87.5: + hs, att_h = 9216, 96 + elif model_size_in_b <= 165.5: + hs, att_h = 9216, 96 + elif model_size_in_b <= 250.5: + hs, att_h = 12288, 96 + else: + raise ValueError("Model_size for BERT must be 
smaller than 25B parameters.") + ffn = 4 * hs + else: + raise NotImplementedError("Model name is not valid.") + + # Try powers of 2 + margin = 0.01 + for attempt in range(0, 10): + for layers in (2**p for p in range(1, 10)): + out_size = _calculate_model_size( + vocab_size=vocab_size, + seq_length=seq_length, + hidden_size=hs, + num_layers=layers, + ffn_size=ffn, + kv_channels=kv, + att_heads=att_h, + model_name=model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin): + return layers, hs, att_h, ffn, kv, lr + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 16 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(16, 201, 16): + out_size = _calculate_model_size( + vocab_size=vocab_size, + seq_length=seq_length, + hidden_size=hs, + num_layers=layers, + ffn_size=ffn, + kv_channels=kv, + att_heads=att_h, + model_name=model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin): + return layers, hs, att_h, ffn, kv, lr + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 2 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(2, 201, 2): + out_size = _calculate_model_size( + vocab_size=vocab_size, + seq_length=seq_length, + hidden_size=hs, + num_layers=layers, + ffn_size=ffn, + kv_channels=kv, + att_heads=att_h, + model_name=model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin): + return layers, hs, att_h, ffn, kv, lr + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 5 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(5, 201, 5): + out_size = _calculate_model_size( + vocab_size=vocab_size, + seq_length=seq_length, + hidden_size=hs, + num_layers=layers, + ffn_size=ffn, + kv_channels=kv, + att_heads=att_h, + model_name=model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin): + return layers, hs, att_h, ffn, kv, lr + margin += 0.01 # Double margin of acceptable model sizes. + + # Try any valid number + margin = 0.01 + for attempt in range(0, 10): + for layers in range(1, 200): + out_size = _calculate_model_size( + vocab_size=vocab_size, + seq_length=seq_length, + hidden_size=hs, + num_layers=layers, + ffn_size=ffn, + kv_channels=kv, + att_heads=att_h, + model_name=model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin): + return layers, hs, att_h, ffn, kv, lr + margin += 0.01 # Double margin of acceptable model sizes. + raise Exception("Number of layers not found, config is not possible.") + + +def generic_base_config( + model_name: str = "llama", + model_version: int = 2, + model_size_in_b: int = 7, + model_measure: str = "B", + cfg: dict = {}, +) -> dict: + """ + Generates a base config dictionary from a base config python file. + :param dict cfg: dict config object for the Auto Configurator tool. + :param str model_name: name of the model, i.e. gpt3, t5, mt5... + :returns: dictionary containing the base configuration for the model. 
+ :rtype: dict + """ + from nemo.collections.llm.tools.auto_configurator.core.base_config import calculate_model_size + + default_model = False if model_size_in_b else True + custom_model = True if cfg.get("custom_model") else False + + model_cls = getattr(base_configs, MODULES[model_name]) + + model_size_in_b = calculate_model_size( + cfg.get("gpu_count"), + cfg.get("max_training_days"), + model_size_in_b, + cfg.get("tflops_per_gpu"), + cfg.get("num_tokens_in_b"), + model_name, + ) + + if default_model: + model = model_cls(cfg=cfg) + elif custom_model: + model = base_configs.custom(name=MODULES[model_name], cfg=cfg) + else: + model = model_cls(version=model_version, size=model_size_in_b, measure=model_measure, cfg=cfg) + + base_cfg = { + "model": model.get_model_config(), + "optim": model.get_optim_config(), + "trainer": model.get_trainer_config(), + "data": model.get_data_config(), + "run": model.get_run_config(), + } + + if default_model: + num_layers, hidden_size, num_attention_heads, ffn_hidden_size, kv_channels, _ = calculate_model_size_params( + model_size_in_b, + cfg.get("vocab_size"), + cfg.get("seq_length"), + model_name, + ) + + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: + base_cfg["model"].num_layers = num_layers + base_cfg["model"].hidden_size = hidden_size + base_cfg["model"].num_attention_heads = num_attention_heads + base_cfg["model"].kv_channels = kv_channels if kv_channels else None + if not ffn_hidden_size: + base_cfg["model"].ffn_hidden_size = hidden_size * 4 + else: + base_cfg["model"].ffn_hidden_size = ffn_hidden_size + + cfg["model_size_in_b"] = model_size_in_b + + return base_cfg, cfg + + +def modify_cfg( + base_cfg: dict, + act: int, + num_mbs_act: int, + act_per_pipe: int, + tp: int, + pp: int, + cp: int, + ep: int, + virtual_pipelines: int, + mbs: int, + max_minutes: int, + max_steps: int, + num_nodes: int, + model_name: str, +) -> dict: + """ + Modify the base configuration for the model with the new parameters that are specific to the current model, which the Auto Configurator tool heuristics selected. + :param dict base_cfg: base configuration for the current model, which will be modified in this function. + :param int act: number of activation checkpointing layers to use for the model. + :param int num_mbs_act: sets the number of micro-batches where only a partial number of Transformer layers get checkpointed and recomputed within a window of micro-batches. + :param int act_per_pipe: sets the number of Transformer layers to skip checkpointing at later pipeline stages. + :param int tp: Tensor Parallelism (TP) value to be set for the model. + :param int pp: Pipeline Parallelism (PP) value to be set for the model. + :param int cp: Context Parallelism (CP) value to be set for the model. + :param int ep: Expert Parallelism (EP) value to be set for the model. + :param int virtual_pipelines: Virtual Pipelines value to be set for the model. + :param int mbs: Micro Batch Size (MBS) value to be set for the model. + :param int max_minutes: maximum amount of time to run this model for. + :param int max_steps: maximum number of steps to run this model for. + :param int num_nodes: number of nodes to use for the training run. + :param str model_name: name of the model, i.e. gpt3, t5, mt5... + :return: dictionary containing the updated model configuration parameters. 
+ :rtype: dict + """ + new_cfg = copy.deepcopy(base_cfg) + if act is not None: + if model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + ]: + new_cfg["auto_config"]["activations_checkpoint_num_layers"] = act + else: + new_cfg["auto_config"]["encoder"]["activations_checkpoint_num_layers"] = act // 2 + new_cfg["auto_config"]["decoder"]["activations_checkpoint_num_layers"] = act // 2 + + if num_mbs_act is not None and model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + ]: + new_cfg["auto_config"]["num_micro_batches_with_partial_activation_checkpoints"] = num_mbs_act + + if act_per_pipe is not None and model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + ]: + new_cfg["auto_config"]["activations_checkpoint_layers_per_pipeline"] = act_per_pipe + + if virtual_pipelines is not None and model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + ]: + new_cfg["auto_config"]["virtual_pipeline_model_parallel_size"] = virtual_pipelines + + new_cfg["auto_config"]["tensor_model_parallel_size"] = tp + new_cfg["auto_config"]["pipeline_model_parallel_size"] = pp + new_cfg["auto_config"]["micro_batch_size"] = mbs + new_cfg["data"]["micro_batch_size"] = mbs + + if cp is not None: + new_cfg["auto_config"]["context_parallel_size"] = cp + + if ep is not None: + new_cfg["auto_config"]["expert_model_parallel_size"] = ep + + if model_name in [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + ]: + att_heads = new_cfg["model"].num_attention_heads + num_layers = new_cfg["model"].num_layers + else: + att_heads = new_cfg["model"].encoder.num_attention_heads + num_layers = new_cfg["model"].encoder.num_layers + + # gbs = mbs * num_gpus * accumulate_grad_batches / (tp * pp) + num_gpus = new_cfg["trainer"]["num_nodes"] * new_cfg["trainer"]["devices"] + gbs = new_cfg["model"].global_batch_size + new_cfg["data"]["global_batch_size"] = gbs + seq_len = new_cfg["model"].seq_length + + mod_gbs = gbs % (mbs * num_gpus / (tp * pp)) + mod_att_heads = att_heads % tp + mod_layers = num_layers % pp + if mod_gbs == 0 and mod_att_heads == 0 and mod_layers == 0: + # Valid config + new_cfg["trainer"]["num_nodes"] = num_nodes # Necessary for short single-node test. + new_cfg["trainer"]["max_steps"] = max_steps + new_cfg["trainer"]["val_check_interval"] = max_steps + days = max_minutes // 3600 + hours = (max_minutes % 3600) // 60 + mins = (max_minutes % 3600) % 60 + new_cfg["run"]["time_limit"] = f"{days}-{hours}:{mins}:00" + new_cfg["run"][ + "name" + ] = f"{new_cfg['run']['name']}_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" + print( + f"Valid config: SeqLen={seq_len}, GBS={gbs}, MBS={mbs}, TP={tp}, PP={pp}, CP={cp}, EP={ep}, act_ckpt_layers={act}, num_mbs_act={num_mbs_act}, act_per_pipe={act_per_pipe}. Adding to directory." 
+ ) + return new_cfg + return None From cf50b14fe12f173f2872c221f6ae957c5ea9ef45 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 13:23:16 +0000 Subject: [PATCH 03/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/core/training_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 412766cc9f2c..56fc56e2feea 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -24,7 +24,7 @@ def search_training_config( - base_cfg: dict, + base_cfg: dict, train_cfg: dict, ) -> None: """ @@ -41,7 +41,7 @@ def search_training_config( def generate_grid_search_configs( - base_cfg: dict, + base_cfg: dict, train_cfg: dict, ) -> Tuple[str, List[int], int]: """ From e89ed2582904233f325692d00e60debdbd0e0dbf Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 06:23:09 -0700 Subject: [PATCH 04/63] add runner Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/runner.py | 303 ++++++++++++++++++ 1 file changed, 303 insertions(+) create mode 100644 nemo/collections/llm/tools/auto_configurator/runner.py diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py new file mode 100644 index 000000000000..072f23d0d285 --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -0,0 +1,303 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the License); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
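+ # Example usage (illustrative, arguments abbreviated):
+ #   runner = AutoConfigurator(model_type="gpt3", num_nodes=1, data_paths=["/path/to/dataset"], ...)
+ #   configs = runner.generate_configs()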
+ +from typing import List, Optional + +import torch +from pytorch_lightning.loggers import TensorBoardLogger + +from nemo import lightning as nl +from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer +from nemo.collections.llm import GPTModel, PreTrainingDataModule +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.tools.auto_configurator.core.search_config import search_configs +from nemo.collections.llm.utils import Config, Partial +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule +from nemo.utils import logging +from nemo.utils.exp_manager import TimingCallback + +SUPPORTED_MODELS = [ + "gpt3", + "llama", + "mixtral", + "mistral", + "gemma", +] + +SUPPORTED_TOKENIZERS = [ + "autotokenizer", + "sentencepiece", + "huggingface", +] + + +class AutoConfigurator: + """Auto Configurator runner config class.""" + + def __init__( + self, + model_type: str = None, + num_nodes: int = None, + data_paths: List = None, + path_to_logs: Optional[str] = None, + tokenizer_type: Optional[str] = "autotokenizer", + tokenizer_path: Optional[str] = "GPT2BPETokenizer", + model_size: Optional[int] = None, + model_version: Optional[int] = None, + gpus_per_node: Optional[int] = 8, + gpu_memory_gb: Optional[int] = 80, + model_measure: Optional[str] = "B", + seq_length: Optional[int] = 2048, + global_batch_size: Optional[int] = "auto", + tensor_parallel_sizes: Optional[List[int]] = "auto", + pipeline_parallel_sizes: Optional[List[int]] = "auto", + micro_batch_sizes: Optional[List[int]] = "auto", + context_parallel_sizes: Optional[List[int]] = [1], + expert_parallel_sizes: Optional[List[int]] = [1], + min_model_parallel_size: Optional[int] = "auto", + max_model_parallel_size: Optional[int] = "auto", + num_tokens_in_b: Optional[int] = 300, + tflops_per_gpu: Optional[int] = 140, + max_minutes_per_run: Optional[int] = 30, + max_training_days: Optional[int] = 2, + max_steps_per_run: Optional[int] = 50, + vocab_size: Optional[int] = 51200, + model_args: Optional[dict] = {}, + custom_model: Optional[bool] = False, + nemo_run: Optional[bool] = False, + ): + """ + :param str model_type: model type to be used for training. + :param int num_nodes: number of nodes to be used for training. + :param List data_paths: list of datafiles to be used for training. + :param str path_to_logs: path to the directory where the logs will be stored. + :param Optional[str] tokenizer_type: tokenizer type. + :param Optional[str] tokenizer_path: path to the tokenizer model. + :param Optional[int] model_size: size of model to be trained. + :param Optional[int] model_version: version of model. 3 for GPT3, 2 for Llama2. + :param Optional[int] gpus_per_node: number of GPUs per node to be used. + :param Optional[int] gpu_memory_gb: memory per GPU, in GB. Currently 40GB and 80GB A100s/H100s supported. + :param Optional[str] model_measure: "M" if model_size is specified in millions. "B" if in billions. + :param Optional[int] seq_length: model sequence length. Available seq_length list for GPT-based models: [2048, 4096, 8192, 16384, 32768]. + :param Optional[int] global_batch_size: model global batch size. Set to "auto" if you want auto configurator to find optimal gbs. + :param Optional[List[int]] tensor_parallel_sizes: set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + :param Optional[List[int]] pipeline_parallel_sizes: set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. 
+ :param Optional[List[int]] micro_batch_sizes: set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. + :param Optional[List[int]] context_parallel_sizes: model context parallel size. A list, such as [1, 2, 4, 8]. + :param Optional[List[int]] expert_parallel_sizes: model expert parallel size. A list, such as [1, 2, 4, 8]. + :param Optional[int] min_model_parallel_size: set to "auto" to use our recommendation, or a value for the minimum desired parallelism. + :param Optional[int] max_model_parallel_size: set to "auto" to use our recommendation, or a value for the maximum desired parallelism. + :param Optional[int] num_tokens_in_b: number of tokens in billions in train dataset. + :param Optional[int] tflops_per_gpu: estimated tflops per GPU. + :param Optional[int] max_minutes_per_run: maximum number of minutes per run for the grid search. + :param Optional[int] max_training_days: number of days expected model to be trained. + :param Optional[int] max_steps_per_run: maximum number of steps per run for the grid search. + :param Optional[int] vocab_size: size of tokenizer vocabulary. + :param Optional[dict] model_args: additional args to add to mdoel config. + :param Optional[bool] custom_model: set to True if you want to use custom model. + :param Optional[bool] nemo_sdk: set to True if you want to run Auto Configurator with nemo-sdk. + """ + + assert model_type in SUPPORTED_MODELS, f"model_type must be set to one of {SUPPORTED_MODELS}." + assert tokenizer_type in SUPPORTED_TOKENIZERS, f"tokenizer_type must be set to one of {SUPPORTED_TOKENIZERS}." + assert num_nodes, "num_nodes value must be specified." + assert data_paths, "training data must be specified." + if nemo_run: + assert path_to_logs, f"path_to_logs parameter must be specified." + + self.config = locals() + self.config.pop('self') + + # Print the config + logging.info(self._get_message(self.config)) + + def generate_configs(self) -> dict: + """ + :return: dictionary of generated configs. + key: model config name, type: str. + value: model config values, type: dict. + :rtype: dict. + """ + + configs = search_configs(self.config) + if self.config["nemo_run"]: + configs = self._generate_nemo_run_configs( + configs, + self.config["tokenizer_type"], + self.config["tokenizer_path"], + self.config["path_to_logs"], + ) + + return configs + + def _generate_nemo_run_configs( + self, + configs: dict, + tokenizer_type: str, + tokenizer_path: str, + path_to_logs: str, + ) -> dict: + """ + Function that returns a dictionary of Partial configs. + :param: dict config: runner config. + :param: str tokenizer_type: tokenizer type. + :param: str tokenizer_path: path to the tokenizer. + :param: str path_to_logs: path to logs directory. + :return: dictionary of Partial configs. + :rtype: dict. + """ + + tokenizer = self._get_tokenizer(tokenizer_type, tokenizer_path) + for name, config in configs.items(): + strategy = self._get_startegy(config['auto_config']) + configs[name] = Partial( + pretrain, + model=self._get_model(config['model'], tokenizer), + trainer=self._get_trainer(config['trainer'], strategy), + data=self._get_data(config['data'], tokenizer), + optim=self._get_optim(config['optim']), + log=self._get_logger(name, path_to_logs), + resume=None, + ) + + return configs + + def _get_model(self, model_config, tokenizer): + return GPTModel(model_config, tokenizer=tokenizer) + + def _get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: + """ + Function that returns the tokenizer config. 
+ :param: str tokenizer_type: tokenizer type. + :param: str tokenizer_path: path to the tokenizer. + :return: tokenizer config. + :rtype: Config. + """ + + if tokenizer_type == "sentencepiece": + return Config(SentencePieceTokenizer, model_path=tokenizer_path) + else: + return Config(AutoTokenizer, pretrained_model_name=tokenizer_path) + + def _get_data(self, data_config: dict, tokenizer_config: Config) -> Config: + """ + Function that returns the data module. + :param: Config tokenizer: tokenizer config. + :return: data module. + :rtype: Config. + """ + + return Config( + PreTrainingDataModule, + **data_config, + tokenizer=tokenizer_config, + ) + + def _get_optim(self, optim_config: Config) -> Config: + """ + Function that returns the optimizer. + :param: Config optim_config: optimizer config. + :return: optimizer. + :rtype: Config. + """ + + sched = Config( + CosineAnnealingScheduler, + warmup_steps=10, + constant_steps=0, + min_lr=optim_config.min_lr, + ) + + return Config( + MegatronOptimizerModule, + config=optim_config, + lr_scheduler=sched, + ) + + def _get_trainer(self, trainer_config: dict, strategy: Config) -> Config: + """ + Function that returns the trainer. + :param: dict trainer_config: trainer config. + :param: Config strategy: training strategy. + :return: trainer. + :rtype: Config. + """ + + return Config( + nl.Trainer, + **trainer_config, + strategy=strategy, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + callbacks=[Config(TimingCallback)], + ) + + def _get_startegy(self, auto_config: dict) -> Config: + """ + Function that returns the training strategy. + :param: dict auto_config: model parallelism config. + :return: training strategy. + :rtype: Config. + """ + + return Config( + nl.MegatronStrategy, + pipeline_dtype=torch.bfloat16, + tensor_model_parallel_size=auto_config.get('tensor_model_parallel_size', 1), + pipeline_model_parallel_size=auto_config.get('pipeline_model_parallel_size', 1), + virtual_pipeline_model_parallel_size=auto_config.get('virtual_pipeline_model_parallel_size', None), + context_parallel_size=auto_config.get('context_parallel_size', 1), + expert_model_parallel_size=auto_config.get('expert_model_parallel_size', 1), + ) + + def _get_logger(self, run_name: str, path_to_logs: str) -> Config: + """ + Function that returns the training strategy. + :param: str run_name: name of run. + :param: str path_to_logs: path to logs directory. + :return: training logger. + :rtype: Config. + """ + + tb_logger = Config(TensorBoardLogger, save_dir=path_to_logs) + + ckpt = Config( + nl.ModelCheckpoint, + monitor="reduced_train_loss", + save_best_model=False, + save_last=False, + save_top_k=0, + ) + + return Config( + nl.NeMoLogger, + ckpt=ckpt, + name=run_name, + tensorboard=tb_logger, + wandb=None, + dir=path_to_logs, + ) + + def _get_message(self, config: dict) -> str: + """ + Function that returns runner config line by line. + :param: dict config: runner config. + :return: runner config params. + :rtype: str. 
+ """ + + message = "AutoConfigurator runner config:\n" + for key, value in config.items(): + message += f"{key}: {value}\n" + + return message From 0a2a6a0696b0c3331400ff19e2ee44b7cc6e463a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 06:32:04 -0700 Subject: [PATCH 05/63] add end-to-end example for auto configurator Signed-off-by: dimapihtar --- examples/llm/auto_configurator/auto_config.py | 175 ++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 examples/llm/auto_configurator/auto_config.py diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py new file mode 100644 index 000000000000..071f193a6cd5 --- /dev/null +++ b/examples/llm/auto_configurator/auto_config.py @@ -0,0 +1,175 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +import shutil + +import torch +from pytorch_lightning.loggers import TensorBoardLogger + +from nemo import lightning as nl +from nemo.collections import llm +from nemo.collections.llm.api import train +from nemo.collections.llm.gpt.data import PreTrainingDataModule +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, get_results +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning import NeMoLogger +from nemo.lightning.pytorch.callbacks import ModelCheckpoint +from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule +from nemo.utils.exp_manager import TimingCallback + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--run_number", type=int, help="Number of config to run") + parser.add_argument("--logs_dir", type=str, help="Path where to save training logs") + parser.add_argument("--data_path", type=str, help="Path to the dataset") + parser.add_argument("--get_results", action="store_true") + + return parser.parse_args() + + +def train_config(args): + # GPT-3 126M + # This example will generate 3 configs. + # It is expected that this script will be run 3 times with changing --run_number flag for each run from 0 to 2. + # After all configurations are trained, please trigger the script using --get_results flag. 
+ runner = AutoConfigurator( + model_type="gpt3", + model_version=3, + model_size=126, + model_measure="M", + num_nodes=1, + gpus_per_node=1, + gpu_memory_gb=40, + global_batch_size=16, + seq_length=512, + tensor_parallel_sizes=[1], + pipeline_parallel_sizes=[1], + micro_batch_sizes=[1, 2, 4], + max_training_days=1, + max_steps_per_run=25, + num_tokens_in_b=10, + vocab_size=51200, + data_paths=args.data_path, + ) + + # Get generated configs + configs = runner.generate_configs() + + tokenizer = get_nmt_tokenizer( + "megatron", + "GPT2BPETokenizer", + ) + + # Define candidate to run + runs = list(configs.keys()) + run_name = runs[args.run_number] + config = configs[run_name] + + # Define data + config["data"].pop('split') + data = PreTrainingDataModule(**config["data"], split="900,50,50", tokenizer=tokenizer) + + # Define model + config["model"].tensor_model_parallel_size = config["auto_config"].get("tensor_model_parallel_size") + config["model"].pipeline_model_parallel_size = config["auto_config"].get("pipeline_model_parallel_size") + config["model"].context_parallel_size = config["auto_config"].get("context_parallel_size") + config["model"].expert_model_parallel_size = config["auto_config"].get("expert_model_parallel_size") + model = llm.GPTModel(config["model"], tokenizer=data.tokenizer) + + # Define optimizer + opt = MegatronOptimizerModule(config=config["optim"]) + + # Define strategy + strategy = nl.MegatronStrategy( + pipeline_dtype=torch.bfloat16, + tensor_model_parallel_size=config["auto_config"].get("tensor_model_parallel_size"), + pipeline_model_parallel_size=config["auto_config"].get("pipeline_model_parallel_size"), + virtual_pipeline_model_parallel_size=config["auto_config"].get("virtual_pipeline_model_parallel_size", None), + context_parallel_size=config["auto_config"].get("context_parallel_size"), + expert_model_parallel_size=config["auto_config"].get("expert_model_parallel_size"), + ) + + # Define TensorBoard logger + tensorboard_logger = TensorBoardLogger( + save_dir=f"{args.logs_dir}/{run_name}", + ) + + # Define trainer + trainer = nl.Trainer( + **config["trainer"], + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + callbacks=[TimingCallback()], + logger=tensorboard_logger, + ) + + # Define logger + nemo_logger = NeMoLogger( + dir=f"{args.logs_dir}/{run_name}", + ) + + # Train candidate + train( + model=model, + data=data, + trainer=trainer, + log=nemo_logger, + tokenizer="data", + optim=opt, + ) + + +def main(): + args = get_args() + + if not args.get_results: + train_config(args) + + else: + # Get Auto Configurator results + candidates = [d for d in os.listdir(args.logs_dir) if os.path.isdir(os.path.join(args.logs_dir, d))] + for subdir in candidates: + default_dir = os.path.join(args.logs_dir, subdir, "default") + if os.path.exists(default_dir) and os.path.isdir(default_dir): + for item in os.listdir(default_dir): + s = os.path.join(default_dir, item) + d = os.path.join(args.logs_dir, subdir, item) + shutil.move(s, d) + + os.rmdir(default_dir) + + get_results( + training_logs=args.logs_dir, + path_to_save=args.logs_dir, + model_name="gpt3", + model_version=3, + model_size=126, + model_measure="M", + num_nodes=1, + gpus_per_node=1, + global_batch_size=16, + seq_length=512, + max_training_days=1, + num_tokens_in_b=10, + vocab_size=51200, + ) + + print(f"The results were successfully saved to {args.logs_dir}.") + + +if __name__ == '__main__': + main() From b0d8478e2b6236c3e19e49e49843ad6cb6e67737 Mon Sep 17 00:00:00 2001 From: dimapihtar 
Date: Tue, 27 Aug 2024 06:34:21 -0700 Subject: [PATCH 06/63] add unit tests for auto configurator Signed-off-by: dimapihtar --- .../llm/auto_conf/test_base_configs.py | 264 +++++++++++ .../llm/auto_conf/test_generate_configs.py | 446 ++++++++++++++++++ tests/collections/llm/auto_conf/test_utils.py | 155 ++++++ 3 files changed, 865 insertions(+) create mode 100644 tests/collections/llm/auto_conf/test_base_configs.py create mode 100644 tests/collections/llm/auto_conf/test_generate_configs.py create mode 100644 tests/collections/llm/auto_conf/test_utils.py diff --git a/tests/collections/llm/auto_conf/test_base_configs.py b/tests/collections/llm/auto_conf/test_base_configs.py new file mode 100644 index 000000000000..847e33cd8ba5 --- /dev/null +++ b/tests/collections/llm/auto_conf/test_base_configs.py @@ -0,0 +1,264 @@ +import re + +from megatron.core.optimizer import OptimizerConfig + +from nemo.collections.llm.tools.auto_configurator import base_configs +from nemo.collections.llm.utils import Config + + +def get_class_name(config_cls): + match = re.search(r' Date: Tue, 27 Aug 2024 06:42:31 -0700 Subject: [PATCH 07/63] add GPT configs Signed-off-by: dimapihtar --- nemo/collections/llm/gpt/model/__init__.py | 6 +++ nemo/collections/llm/gpt/model/base.py | 54 ++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py index a0132a34d185..588bfc8b58b2 100644 --- a/nemo/collections/llm/gpt/model/__init__.py +++ b/nemo/collections/llm/gpt/model/__init__.py @@ -1,6 +1,12 @@ from nemo.collections.llm.gpt.model.baichuan import Baichuan2Config, Baichuan2Config7B, Baichuan2Model from nemo.collections.llm.gpt.model.base import ( GPTConfig, + GPTConfig5B, + GPTConfig7B, + GPTConfig20B, + GPTConfig40B, + GPTConfig126M, + GPTConfig175B, GPTModel, MaskedTokenLossReduction, gpt_data_step, diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 2badfa2b1915..7a63f61530a3 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -148,6 +148,60 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel": ) +@dataclass +class GPTConfig126M(GPTConfig): + seq_length: int = 2048 + num_layers: int = 12 + hidden_size: int = 768 + ffn_hidden_size: int = 3072 + num_attention_heads: int = 12 + + +@dataclass +class GPTConfig5B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 24 + hidden_size: int = 4096 + ffn_hidden_size: int = 16384 + num_attention_heads: int = 32 + + +@dataclass +class GPTConfig7B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 32 + hidden_size: int = 4096 + ffn_hidden_size: int = 10880 + num_attention_heads: int = 32 + + +@dataclass +class GPTConfig20B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 44 + hidden_size: int = 6144 + ffn_hidden_size: int = 24576 + num_attention_heads: int = 48 + + +@dataclass +class GPTConfig40B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 48 + hidden_size: int = 8192 + ffn_hidden_size: int = 32768 + num_attention_heads: int = 64 + + +@dataclass +class GPTConfig175B(GPTConfig): + seq_length: int = 2048 + num_layers: int = 96 + hidden_size: int = 12288 + ffn_hidden_size: int = 49152 + num_attention_heads: int = 96 + + class GPTModel(L.LightningModule, io.IOMixin, io.ConnectorMixin, fn.FNMixin): def __init__( self, From a55165876ea9478dc4c66866f4eb6bba4d78d637 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 07:23:37 -0700 Subject: [PATCH 
08/63] add GPT configs Signed-off-by: dimapihtar --- nemo/collections/llm/__init__.py | 6 ++++++ .../auto_configurator/core/training_config.py | 17 +++++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 812daddf02b6..0a4eddc9b89e 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -32,6 +32,12 @@ GemmaConfig2B, GemmaConfig7B, GemmaModel, + GPTConfig5B, + GPTConfig7B, + GPTConfig20B, + GPTConfig40B, + GPTConfig126M, + GPTConfig175B, GPTConfig, GPTModel, Llama2Config7B, diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 56fc56e2feea..c50a7ab5ac61 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -302,19 +302,24 @@ class GPT3GridSearch80gb: """ model_size_in_b: int = 5 - valid_pp: List[int] + valid_pp: List[int] = field(default_factory=lambda: []) seq_length: int = 2048 model_measure: str = "B" - tp: List[int] = field(default_factory=lambda: [1, 2, 4, 8]) - pp: List[int] = field(default_factory=lambda: [1]) - cp: List[int] = field(default_factory=lambda: [1]) - ep: List[int] = field(default_factory=lambda: [1]) - mbs: List[int] = field(default_factory=lambda: [1, 2, 3, 4, 6, 8]) + #tp: List[int] = "1, 2, 4, 8" + #pp: List[int] = field(default_factory=lambda: [1]) + #cp: List[int] = field(default_factory=lambda: [1]) + #ep: List[int] = field(default_factory=lambda: [1]) + #mbs: List[int] = field(default_factory=lambda: [1, 2, 3, 4, 6, 8]) min_model_parallel: int = 1 max_model_parallel: int = 8 gbs: int = 1024 + tp = [1, 2, 4, 8] + pp = [1] + cp = [1] + ep = [1] + mbs = [1, 2, 4, 8] model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b if seq_length == 2048: From a28f77b2f954e289b36a13bdb3eaf3f517fd95f7 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 14:24:36 +0000 Subject: [PATCH 09/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- nemo/collections/llm/__init__.py | 2 +- .../tools/auto_configurator/core/training_config.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 0a4eddc9b89e..69192f8a438c 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -32,13 +32,13 @@ GemmaConfig2B, GemmaConfig7B, GemmaModel, + GPTConfig, GPTConfig5B, GPTConfig7B, GPTConfig20B, GPTConfig40B, GPTConfig126M, GPTConfig175B, - GPTConfig, GPTModel, Llama2Config7B, Llama2Config13B, diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index c50a7ab5ac61..a3ad493c269d 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -306,11 +306,11 @@ class GPT3GridSearch80gb: seq_length: int = 2048 model_measure: str = "B" - #tp: List[int] = "1, 2, 4, 8" - #pp: List[int] = field(default_factory=lambda: [1]) - #cp: List[int] = field(default_factory=lambda: [1]) - #ep: List[int] = field(default_factory=lambda: [1]) - #mbs: List[int] = field(default_factory=lambda: [1, 2, 3, 4, 6, 8]) + # tp: List[int] = "1, 2, 4, 8" + # pp: List[int] = field(default_factory=lambda: [1]) + # cp: 
List[int] = field(default_factory=lambda: [1]) + # ep: List[int] = field(default_factory=lambda: [1]) + # mbs: List[int] = field(default_factory=lambda: [1, 2, 3, 4, 6, 8]) min_model_parallel: int = 1 max_model_parallel: int = 8 gbs: int = 1024 From 35522abd3b4ecb4f382f99f3fa46c93958c4483e Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 10:06:56 -0700 Subject: [PATCH 10/63] switch to dataclass Signed-off-by: dimapihtar --- nemo/collections/llm/gpt/model/base.py | 2 + .../auto_configurator/core/training_config.py | 669 ++++++++---------- 2 files changed, 311 insertions(+), 360 deletions(-) diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 7a63f61530a3..83ab7f8b7ccc 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -108,6 +108,8 @@ class GPTConfig(TransformerConfig, io.IOMixin): attention_softmax_in_fp32: bool = False masked_softmax_fusion: bool = True deallocate_pipeline_outputs = True + global_batch_size: Optional[int] = 256 + activations_checkpoint_method: Optional[int] = None # TODO: Move this to better places? get_attention_mask_from_fusion: bool = False diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index a3ad493c269d..243a98e3b8a6 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -86,16 +86,7 @@ def generate_grid_search_configs( else: act_method = base_cfg["model"].encoder.activations_checkpoint_method - ( - tp_list, - pp_list, - cp_list, - ep_list, - mbs_list, - min_model_parallel, - max_model_parallel, - gbs, - ) = _calculate_tp_pp_mbs_grid( + params = _calculate_tp_pp_mbs_grid( model_size_in_b=model_size_in_b, num_layers=num_layers, model_name=model_name, @@ -108,13 +99,13 @@ def generate_grid_search_configs( num_nodes = train_cfg.get("num_nodes") valid_tp_pp_list = [] - for tp in tp_list: - for pp in pp_list: - for cp in cp_list: - for ep in ep_list: - for mbs in mbs_list: + for tp in params.tp: + for pp in params.pp: + for cp in params.cp: + for ep in params.ep: + for mbs in params.mbs: num_gpus = base_cfg["trainer"]["num_nodes"] * base_cfg["trainer"]["devices"] - base_cfg["model"].global_batch_size = gbs + base_cfg["model"].global_batch_size = params.gbs if model_name in [ "gpt3", "bert", @@ -132,7 +123,7 @@ def generate_grid_search_configs( att_heads = base_cfg["model"].encoder.num_attention_heads num_layers = base_cfg["model"].encoder.num_layers model_parallelism = (tp * pp * cp * ep) if (cp and ep) else (tp * pp) - mod_gbs = gbs % (mbs * num_gpus / model_parallelism) + mod_gbs = params.gbs % (mbs * num_gpus / model_parallelism) mod_att_heads = att_heads % tp mod_layers = (multiplier * num_layers) % pp mod_cp = cp if cp else 1 @@ -143,7 +134,7 @@ def generate_grid_search_configs( and mod_layers == 0 and (tp, pp, cp, ep) not in valid_tp_pp_list and (mod_cp // mod_ep == mod_cp or mod_ep // mod_cp == mod_ep) - and min_model_parallel <= model_parallelism <= max_model_parallel + and params.min_model_parallel <= model_parallelism <= params.max_model_parallel ): valid_tp_pp_list.append((tp, pp, cp, ep)) @@ -167,7 +158,7 @@ def generate_grid_search_configs( model_name, model_measure, ) - for mbs in mbs_list: + for mbs in params.mbs: kwargs = { "base_cfg": base_cfg, "act": None, @@ -282,7 +273,7 @@ def _set_activations_checkpoint_params( @dataclass -class 
GPT3GridSearch80gb: +class GPT3GridSearch: """ Selects grid search space for TP, PP, MBS parameters for GPT-3 and 80GB GPUs. :param float model_size_in_b: number of parameters in the model. @@ -301,336 +292,308 @@ class GPT3GridSearch80gb: int gbs is the Global Batch Size to use for training. """ - model_size_in_b: int = 5 - valid_pp: List[int] = field(default_factory=lambda: []) - seq_length: int = 2048 - model_measure: str = "B" - - # tp: List[int] = "1, 2, 4, 8" - # pp: List[int] = field(default_factory=lambda: [1]) - # cp: List[int] = field(default_factory=lambda: [1]) - # ep: List[int] = field(default_factory=lambda: [1]) - # mbs: List[int] = field(default_factory=lambda: [1, 2, 3, 4, 6, 8]) - min_model_parallel: int = 1 - max_model_parallel: int = 8 - gbs: int = 1024 + model_size_in_b: int + seq_length: int + gpu_size: int + valid_pp: List[int] + model_measure: str tp = [1, 2, 4, 8] pp = [1] cp = [1] ep = [1] mbs = [1, 2, 4, 8] - model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b + + gbs: int = 1024 + min_model_parallel: int = 1 + max_model_parallel: int = 8 - if seq_length == 2048: - if model_size_in_b <= 1.0: - tp = [1, 2] - gbs = 256 - elif model_size_in_b <= 4.0: - tp = [1, 2, 4] - gbs = 1024 - elif model_size_in_b <= 8.0: - tp = [1, 2, 4] - gbs = 2048 - elif model_size_in_b <= 13.0: - tp = [1, 2, 4, 8] - gbs = 2048 - elif model_size_in_b <= 23.0: - tp = [1, 2, 4] - pp = [x for x in valid_pp if 1 <= x <= 4] - mbs = [1, 2, 4] - min_model_parallel = 4 - max_model_parallel = 8 - gbs = 2048 - elif model_size_in_b <= 45.0: - tp = [2, 4, 8] - pp = [x for x in valid_pp if 1 <= x <= 4] - mbs = [1, 2, 4] - min_model_parallel = 8 - max_model_parallel = 32 - gbs = 2048 - elif model_size_in_b <= 95: - tp = [2, 4, 8] - pp = [x for x in valid_pp if 1 <= x <= 8] - mbs = [1, 2, 4, 8] - min_model_parallel = 8 - max_model_parallel = 64 - gbs = 2048 - elif model_size_in_b <= 130.0: - tp = [2, 4, 8] - pp = [x for x in valid_pp if 1 <= x <= 16] - mbs = [1, 2, 4, 8] - min_model_parallel = 16 - max_model_parallel = 128 - gbs = 2048 - elif model_size_in_b <= 195.0: - tp = [8] - pp = [x for x in valid_pp if 4 <= x <= 16] - mbs = [1, 2, 4] - min_model_parallel = 32 - max_model_parallel = 256 - gbs = 2048 - elif model_size_in_b <= 395.0: - tp = [8] - pp = [x for x in valid_pp if 8 <= x <= 32] - mbs = [1, 2, 4] - min_model_parallel = 64 - max_model_parallel = 512 - gbs = 2048 - elif model_size_in_b <= 790.0: - tp = [8] - pp = [x for x in valid_pp if 8 <= x <= 100] - mbs = [1, 2, 4] - min_model_parallel = 128 - max_model_parallel = 1024 - gbs = 2048 - elif model_size_in_b <= 1100.0: - tp = [8] - pp = [x for x in valid_pp if 16 <= x <= 130] - mbs = [1, 2, 4] - min_model_parallel = 256 - max_model_parallel = 2048 - gbs = 2048 - elif seq_length == 4096: - if model_size_in_b <= 1.0: - tp = [1, 2, 4] - mbs = [1, 2, 4, 8] - gbs = 128 - elif model_size_in_b <= 4.0: - tp = [1, 2, 4] - mbs = [1, 2, 4, 8] - gbs = 512 - elif model_size_in_b <= 8.0: - tp = [1, 2, 4] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1, 2, 4] - gbs = 1024 - elif model_size_in_b <= 13.0: - tp = [2, 4] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1, 2, 4] - gbs = 1024 - elif model_size_in_b <= 23.0: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1, 2] - min_model_parallel = 4 - max_model_parallel = 16 - gbs = 1024 - elif model_size_in_b <= 45.0: - tp = [4, 8] - pp = [x for x in valid_pp if 2 <= x <= 4] - mbs = [1, 2] - min_model_parallel = 8 - max_model_parallel = 32 - gbs = 
1024 - elif model_size_in_b <= 95: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 8] - mbs = [1, 2] - min_model_parallel = 8 - max_model_parallel = 64 - gbs = 1024 - elif seq_length == 8192: - if model_size_in_b <= 1.0: - tp = [1, 2] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1, 2, 4] - gbs = 64 - elif model_size_in_b <= 4.0: - tp = [1, 2, 4] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1, 2, 4] - gbs = 128 - elif model_size_in_b <= 8.0: - tp = [2, 4] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1, 2] - gbs = 256 - elif model_size_in_b <= 13.0: - tp = [2, 4] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1, 2] - gbs = 256 - elif model_size_in_b <= 23.0: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 4] - mbs = [1] - min_model_parallel = 8 - max_model_parallel = 32 - gbs = 256 - elif model_size_in_b <= 45.0: - tp = [8] - pp = [x for x in valid_pp if 4 <= x <= 8] - mbs = [1] - min_model_parallel = 32 - max_model_parallel = 64 - gbs = 256 - elif seq_length == 16384: - if model_size_in_b <= 1.0: - tp = [2, 4] - mbs = [1, 2] - gbs = 32 - elif model_size_in_b <= 4.0: - tp = [2, 4] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1] - gbs = 64 - elif model_size_in_b <= 8.0: - tp = [2, 4] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1] - gbs = 128 - elif model_size_in_b <= 13.0: - tp = [2, 4] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1] - gbs = 128 - elif model_size_in_b <= 23.0: - tp = [4, 8] - pp = [x for x in valid_pp if 2 <= x <= 4] - mbs = [1] - min_model_parallel = 8 - max_model_parallel = 32 - gbs = 128 - elif seq_length == 32768: - if model_size_in_b <= 1.0: - tp = [2, 4] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1] - gbs = 16 - elif model_size_in_b <= 4.0: - tp = [2, 4] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1] - gbs = 32 - elif model_size_in_b <= 8.0: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 2] - min_model_parallel = 4 - max_model_parallel = 16 - mbs = [1] - gbs = 64 - elif model_size_in_b <= 13.0: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 2] - min_model_parallel = 4 - max_model_parallel = 16 - mbs = [1] - gbs = 64 - elif model_size_in_b <= 23.0: - tp = [8] - pp = [x for x in valid_pp if 2 <= x <= 4] - mbs = [1] - min_model_parallel = 16 - max_model_parallel = 32 - gbs = 64 - - -def _tp_pp_mbs_grid_gpt3_40gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: - """ - Selects grid search space for TP, PP, MBS parameters for GPT-3 and 40GB GPUs. - :param float model_size_in_b: number of parameters in the model. - :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. - :param str model_measure: measure of model size (millions or billions). - :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) - WHERE - int tp is the Tensor Parallelism value to use for training. - int pp is the Pipeline Parallelism value to use for training. - int cp is the Context Parallelism value to use for training. - int ep is the Expert Parallelism value to use for training. - int mbs is the Micro Batch Size to use for training. - int min_model_parallel is min Model parallel size to use for training. - int max_model_parallel is max Model parallel size to use for training. - int gbs is the Global Batch Size to use for training. 
- """ - tp = [1, 2, 4, 8] - pp = [1] - cp = [1] - ep = [1] - mbs = [1, 2, 4, 6, 8, 10, 12, 16] - min_model_parallel = 1 - max_model_parallel = 8 - gbs = 1024 - model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b - if model_size_in_b <= 1.0: - tp = [1, 2, 4] - mbs = [1, 2, 4, 8] - gbs = 256 - elif model_size_in_b <= 4.0: - tp = [1, 2, 4, 8] - mbs = [1, 2, 4, 8] - gbs = 1024 - elif model_size_in_b <= 8.0: - tp = [2, 4, 8] - pp = [1, 2] - mbs = [1, 2, 4] - min_model_parallel = 2 - gbs = 2048 - elif model_size_in_b <= 13.0: - tp = [4, 8] - pp = [1, 2, 4] - mbs = [1, 2, 4] - min_model_parallel = 4 - max_model_parallel = 32 - gbs = 2048 - elif model_size_in_b <= 23.0: - tp = [2, 4, 8] - pp = [x for x in valid_pp if 1 <= x <= 8] - min_model_parallel = 8 - max_model_parallel = 64 - gbs = 2048 - elif model_size_in_b <= 45.0: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 12] - mbs = [1, 2, 4] - min_model_parallel = 16 - max_model_parallel = 128 - gbs = 2048 - elif model_size_in_b <= 95: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 16] - mbs = [1, 2, 4] - min_model_parallel = 16 - max_model_parallel = 256 - gbs = 2048 - elif model_size_in_b <= 130.0: - tp = [4, 8] - pp = [x for x in valid_pp if 2 <= x <= 26] - mbs = [1, 2] - min_model_parallel = 32 - max_model_parallel = 512 - gbs = 2048 - elif model_size_in_b <= 195.0: - tp = [4, 8] - pp = [x for x in valid_pp if 2 <= x <= 32] - mbs = [1, 2] - min_model_parallel = 64 - max_model_parallel = 1024 - gbs = 2048 - elif model_size_in_b <= 395.0: - tp = [4, 8] - pp = [x for x in valid_pp if 4 <= x <= 64] - mbs = [1, 2] - min_model_parallel = 128 - max_model_parallel = 2048 - gbs = 2048 - elif model_size_in_b <= 790.0: - tp = [4, 8] - pp = [x for x in valid_pp if 8 <= x <= 128] - mbs = [1, 2] - min_model_parallel = 256 - max_model_parallel = 4096 - gbs = 2048 - elif model_size_in_b <= 1100.0: - tp = [4, 8] - pp = [x for x in valid_pp if 8 <= x <= 192] - mbs = [1, 2] - min_model_parallel = 512 - max_model_parallel = 8192 - gbs = 2048 - return tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs + def init_params(self): + model_size_in_b = self.model_size_in_b / 1000 if self.model_measure == "M" else self.model_size_in_b + gpu_size = self.gpu_size + seq_length = self.seq_length + if gpu_size == 80: + if seq_length== 2048: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [1, 2, 4, 8] + self.gbs = 2048 + elif model_size_in_b <= 23.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 8 + self.gbs = 2048 + elif model_size_in_b <= 45.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 95: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 130.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 16] + self.mbs = [1, 2, 4, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 195.0: + self.tp = [8] + self.pp = [x for 
x in self.valid_pp if 4 <= x <= 16] + self.mbs = [1, 2, 4] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 395.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 32] + self.mbs = [1, 2, 4] + self.min_model_parallel = 64 + self.max_model_parallel = 512 + self.gbs = 2048 + elif model_size_in_b <= 790.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 100] + self.mbs = [1, 2, 4] + self.min_model_parallel = 128 + self.max_model_parallel = 1024 + self.gbs = 2048 + elif model_size_in_b <= 1100.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 16 <= x <= 130] + self.mbs = [1, 2, 4] + self.min_model_parallel = 256 + self.max_model_parallel = 2048 + self.gbs = 2048 + elif seq_length== 4096: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 128 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 512 + elif model_size_in_b <= 8.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1024 + elif model_size_in_b <= 45.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1, 2] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1024 + elif model_size_in_b <= 95: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 1024 + elif seq_length== 8192: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 64 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4] + self.gbs = 128 + elif model_size_in_b <= 8.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 256 + elif model_size_in_b <= 45.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 8] + self.mbs = [1] + self.min_model_parallel = 32 + self.max_model_parallel = 64 + self.gbs = 256 + elif seq_length== 16384: + if model_size_in_b <= 1.0: + self.tp = [2, 4] + self.mbs = [1, 2] + self.gbs = 32 + elif model_size_in_b <= 4.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 8.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 128 + elif model_size_in_b <= 13.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 128 + elif model_size_in_b <= 23.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1] + 
self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 128 + elif seq_length== 32768: + if model_size_in_b <= 1.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 16 + elif model_size_in_b <= 4.0: + self.tp = [2, 4] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1] + self.gbs = 32 + elif model_size_in_b <= 8.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 13.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.mbs = [1] + self.gbs = 64 + elif model_size_in_b <= 23.0: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 4] + self.mbs = [1] + self.min_model_parallel = 16 + self.max_model_parallel = 32 + self.gbs = 64 + elif gpu_size == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.mbs = [1, 2, 4, 8] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4, 8] + self.mbs = [1, 2, 4, 8] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.pp = [1, 2] + self.mbs = [1, 2, 4] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [4, 8] + self.pp = [1, 2, 4] + self.mbs = [1, 2, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 23.0: + self.tp = [2, 4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.min_model_parallel = 8 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 45.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 12] + self.mbs = [1, 2, 4] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 95: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 16] + self.mbs = [1, 2, 4] + self.min_model_parallel = 16 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 130.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 26] + self.mbs = [1, 2] + self.min_model_parallel = 32 + self.max_model_parallel = 512 + self.gbs = 2048 + elif model_size_in_b <= 195.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 32] + self.mbs = [1, 2] + self.min_model_parallel = 64 + self.max_model_parallel = 1024 + self.gbs = 2048 + elif model_size_in_b <= 395.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 64] + self.mbs = [1, 2] + self.min_model_parallel = 128 + self.max_model_parallel = 2048 + self.gbs = 2048 + elif model_size_in_b <= 790.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 128] + self.mbs = [1, 2] + self.min_model_parallel = 256 + self.max_model_parallel = 4096 + self.gbs = 2048 + elif model_size_in_b <= 1100.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 192] + self.mbs = [1, 2] + self.min_model_parallel = 512 + self.max_model_parallel = 8192 + self.gbs = 2048 def _tp_pp_mbs_grid_t5_80gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: """ @@ -1002,24 +965,10 @@ def _calculate_tp_pp_mbs_grid( ] # Only divisors of num_layers are possible. 
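A minimal sketch of how the consolidated GPT3GridSearch dataclass above is driven. The identifiers come from the diff; the concrete values are illustrative only, and the gpu_size field is renamed to gpu_memory_gb by a later patch in this series:

    from nemo.collections.llm.tools.auto_configurator.core.training_config import GPT3GridSearch

    # Hypothetical inputs; valid_pp is the divisors-of-num_layers list computed above.
    params = GPT3GridSearch(
        model_size_in_b=5,
        seq_length=2048,
        gpu_size=80,
        valid_pp=[1, 2, 3, 4, 6, 8, 12, 24],
        model_measure="B",
    )
    params.init_params()
    # For a 5B model at sequence length 2048 on 80GB GPUs, init_params() selects
    # tp == [1, 2, 4] and gbs == 2048 and leaves pp, cp, ep and mbs at their class
    # defaults; generate_grid_search_configs() then sweeps over those lists.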
if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: - if gpu_memory_gb == 80: - print(model_size_in_b, valid_pp, seq_length, model_measure) - params = GPT3GridSearch80gb( - model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length, model_measure=model_measure - ) - elif gpu_memory_gb == 40: - ( - tp, - pp, - cp, - ep, - mbs, - min_model_parallel, - max_model_parallel, - gbs, - ) = _tp_pp_mbs_grid_gpt3_40gb( - model_size_in_b=model_size_in_b, valid_pp=valid_pp, model_measure=model_measure - ) + params = GPT3GridSearch( + model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length, model_measure=model_measure, gpu_size=gpu_memory_gb + ) + params.init_params() elif model_name in ["t5", "mt5"]: if gpu_memory_gb == 80: ( From 399385ba8562f39565d45d9ef4d78191449633e4 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 17:08:06 +0000 Subject: [PATCH 11/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../auto_configurator/core/training_config.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 243a98e3b8a6..6c96d58a6159 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -303,7 +303,7 @@ class GPT3GridSearch: cp = [1] ep = [1] mbs = [1, 2, 4, 8] - + gbs: int = 1024 min_model_parallel: int = 1 max_model_parallel: int = 8 @@ -314,7 +314,7 @@ def init_params(self): seq_length = self.seq_length if gpu_size == 80: - if seq_length== 2048: + if seq_length == 2048: if model_size_in_b <= 1.0: self.tp = [1, 2] self.gbs = 256 @@ -383,7 +383,7 @@ def init_params(self): self.min_model_parallel = 256 self.max_model_parallel = 2048 self.gbs = 2048 - elif seq_length== 4096: + elif seq_length == 4096: if model_size_in_b <= 1.0: self.tp = [1, 2, 4] self.mbs = [1, 2, 4, 8] @@ -423,7 +423,7 @@ def init_params(self): self.min_model_parallel = 8 self.max_model_parallel = 64 self.gbs = 1024 - elif seq_length== 8192: + elif seq_length == 8192: if model_size_in_b <= 1.0: self.tp = [1, 2] self.pp = [x for x in self.valid_pp if 1 <= x <= 2] @@ -458,7 +458,7 @@ def init_params(self): self.min_model_parallel = 32 self.max_model_parallel = 64 self.gbs = 256 - elif seq_length== 16384: + elif seq_length == 16384: if model_size_in_b <= 1.0: self.tp = [2, 4] self.mbs = [1, 2] @@ -485,7 +485,7 @@ def init_params(self): self.min_model_parallel = 8 self.max_model_parallel = 32 self.gbs = 128 - elif seq_length== 32768: + elif seq_length == 32768: if model_size_in_b <= 1.0: self.tp = [2, 4] self.pp = [x for x in self.valid_pp if 1 <= x <= 2] @@ -595,6 +595,7 @@ def init_params(self): self.max_model_parallel = 8192 self.gbs = 2048 + def _tp_pp_mbs_grid_t5_80gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: """ Selects grid search space for TP, PP, MBS parameters for T5/mT5 and 80GB GPUs. 
@@ -966,7 +967,11 @@ def _calculate_tp_pp_mbs_grid( if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: params = GPT3GridSearch( - model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length, model_measure=model_measure, gpu_size=gpu_memory_gb + model_size_in_b=model_size_in_b, + valid_pp=valid_pp, + seq_length=seq_length, + model_measure=model_measure, + gpu_size=gpu_memory_gb, ) params.init_params() elif model_name in ["t5", "mt5"]: From b616b41c68b6e2667cc994d638efec593527b6cf Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 12:13:31 -0700 Subject: [PATCH 12/63] switch to dataclass Signed-off-by: dimapihtar --- .../auto_configurator/core/training_config.py | 578 ++++++++---------- 1 file changed, 254 insertions(+), 324 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 6c96d58a6159..052397682fb3 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -596,7 +596,8 @@ def init_params(self): self.gbs = 2048 -def _tp_pp_mbs_grid_t5_80gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: +@dataclass +class T5GridSearch: """ Selects grid search space for TP, PP, MBS parameters for T5/mT5 and 80GB GPUs. :param float model_size_in_b: number of parameters in the model. @@ -613,153 +614,139 @@ def _tp_pp_mbs_grid_t5_80gb(model_size_in_b: float, valid_pp: List[int], model_m int max_model_parallel is max Model parallel size to use for training. int gbs is the Global Batch Size to use for training. """ - tp = [1, 2, 4, 8] - pp = [1] - cp = [None] - ep = [None] - mbs = [1, 2, 4, 6, 8, 12, 16] - min_model_parallel = 1 - max_model_parallel = 8 - gbs = 1920 - model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b - if model_size_in_b <= 1.0: - tp = [1, 2] - mbs = [16, 32, 64, 128] - gbs = 2048 - elif model_size_in_b <= 4.0: - tp = [1, 2, 4] - mbs = [4, 6, 8, 12, 16, 24, 32, 48] - gbs = 1920 - elif model_size_in_b <= 8.0: - tp = [2, 4, 8] - mbs = [4, 6, 8, 12, 16, 24, 32] - gbs = 1920 - elif model_size_in_b <= 14.5: - tp = [4, 8] - mbs = [2, 4, 6, 8, 12, 16, 24] - gbs = 1920 - elif model_size_in_b <= 25.9: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [1, 2, 4, 6, 8] - min_model_parallel = 4 - max_model_parallel = 16 - gbs = 1920 - elif model_size_in_b <= 43.0: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 4] - mbs = [1, 2, 4, 6, 8] - min_model_parallel = 8 - max_model_parallel = 32 - gbs = 1920 - elif model_size_in_b <= 85.5: - tp = [4, 8] - pp = [x for x in valid_pp if 2 <= x <= 8] - mbs = [1, 2, 4, 6, 8] - min_model_parallel = 16 - max_model_parallel = 64 - gbs = 1920 - elif model_size_in_b <= 165.5: - tp = [8] - pp = [x for x in valid_pp if 4 <= x <= 16] - mbs = [1, 2, 4, 6] - min_model_parallel = 32 - max_model_parallel = 128 - gbs = 1920 - elif model_size_in_b <= 250: - tp = [8] - pp = [x for x in valid_pp if 4 <= x <= 32] - mbs = [1, 2, 4, 6, 8] - min_model_parallel = 64 - max_model_parallel = 256 - gbs = 1920 - return tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs + model_size_in_b: int + seq_length: int + gpu_size: int + valid_pp: List[int] + model_measure: str -def _tp_pp_mbs_grid_t5_40gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: - 
""" - Selects grid search space for TP, PP, MBS parameters for T5/mT5 and 40GB GPUs. - :param float model_size_in_b: number of parameters in the model. - :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. - :param str model_measure: measure of model size (millions or billions). - :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) - WHERE - int tp is the Tensor Parallelism value to use for training. - int pp is the Pipeline Parallelism value to use for training. - int cp is the Context Parallelism value to use for training. - int ep is the Expert Parallelism value to use for training. - int mbs is the Micro Batch Size to use for training. - int min_model_parallel is min Model parallel size to use for training. - int max_model_parallel is max Model parallel size to use for training. - int gbs is the Global Batch Size to use for training. - """ tp = [1, 2, 4, 8] pp = [1] cp = [None] ep = [None] mbs = [1, 2, 4, 6, 8, 12, 16] - min_model_parallel = 1 - max_model_parallel = 8 - gbs = 1920 - model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b - if model_size_in_b <= 1.0: - tp = [1, 2] - mbs = [16, 32, 64, 128] - gbs = 2048 - elif model_size_in_b <= 4.0: - tp = [1, 2, 4] - mbs = [4, 8, 12, 16, 24, 32, 48] - gbs = 1920 - elif model_size_in_b <= 8.0: - tp = [2, 4, 8] - mbs = [4, 6, 8, 12, 16, 24] - gbs = 1920 - elif model_size_in_b <= 14.5: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 2] - mbs = [2, 4, 6, 8, 12, 16] - min_model_parallel = 4 - max_model_parallel = 16 - gbs = 1920 - elif model_size_in_b <= 25.9: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 8] - mbs = [1, 2, 4, 6, 8] - min_model_parallel = 8 - max_model_parallel = 32 - gbs = 1920 - elif model_size_in_b <= 43.0: - tp = [4, 8] - pp = [x for x in valid_pp if 1 <= x <= 8] - mbs = [1, 2, 4, 6, 8] - min_model_parallel = 16 - max_model_parallel = 32 - gbs = 1920 - elif model_size_in_b <= 85.5: - tp = [8] - pp = [x for x in valid_pp if 2 <= x <= 8] - mbs = [1, 2, 4, 6, 8] - min_model_parallel = 32 - max_model_parallel = 64 - gbs = 1920 - elif model_size_in_b <= 165.5: - tp = [8] - pp = [x for x in valid_pp if 4 <= x <= 32] - mbs = [1, 2, 4] - min_model_parallel = 64 - max_model_parallel = 128 - gbs = 1920 - elif model_size_in_b <= 250: - tp = [8] - pp = [x for x in valid_pp if 8 <= x <= 64] - mbs = [1, 2, 4] - min_model_parallel = 128 - max_model_parallel = 256 - gbs = 1920 - return tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs + + gbs: int = 1920 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + def init_params(self): + model_size_in_b = self.model_size_in_b / 1000 if self.model_measure == "M" else self.model_size_in_b + gpu_size = self.gpu_size + seq_length = self.seq_length -def _tp_pp_mbs_grid_bert_80gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: + if gpu_size == 80: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.mbs = [16, 32, 64, 128] + self.gbs = 2048 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [4, 6, 8, 12, 16, 24, 32, 48] + self.gbs = 1920 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [4, 6, 8, 12, 16, 24, 32] + self.gbs = 1920 + elif model_size_in_b <= 14.5: + self.tp = [4, 8] + self.mbs = [2, 4, 6, 8, 12, 16, 24] + self.gbs = 1920 + elif model_size_in_b <= 25.9: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [1, 2, 4, 6, 8] + 
self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1920 + elif model_size_in_b <= 43.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 4] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 85.5: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 64 + self.gbs = 1920 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 16] + self.mbs = [1, 2, 4, 6] + self.min_model_parallel = 32 + self.max_model_parallel = 128 + self.gbs = 1920 + elif model_size_in_b <= 250: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 32] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 64 + self.max_model_parallel = 256 + self.gbs = 1920 + elif gpu_size == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.mbs = [16, 32, 64, 128] + self.gbs = 2048 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.mbs = [4, 8, 12, 16, 24, 32, 48] + self.gbs = 1920 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [4, 6, 8, 12, 16, 24] + self.gbs = 1920 + elif model_size_in_b <= 14.5: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 2] + self.mbs = [2, 4, 6, 8, 12, 16] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 1920 + elif model_size_in_b <= 25.9: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 43.0: + self.tp = [4, 8] + self.pp = [x for x in self.valid_pp if 1 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 32 + self.gbs = 1920 + elif model_size_in_b <= 85.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 2 <= x <= 8] + self.mbs = [1, 2, 4, 6, 8] + self.min_model_parallel = 32 + self.max_model_parallel = 64 + self.gbs = 1920 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 4 <= x <= 32] + self.mbs = [1, 2, 4] + self.min_model_parallel = 64 + self.max_model_parallel = 128 + self.gbs = 1920 + elif model_size_in_b <= 250: + self.tp = [8] + self.pp = [x for x in self.valid_pp if 8 <= x <= 64] + self.mbs = [1, 2, 4] + self.min_model_parallel = 128 + self.max_model_parallel = 256 + self.gbs = 1920 + + +@dataclass +class BertGridSearch: """ Selects grid search space for TP, PP, MBS parameters for BERT and 80GB GPUs. :param float model_size_in_b: number of parameters in the model. @@ -776,144 +763,131 @@ def _tp_pp_mbs_grid_bert_80gb(model_size_in_b: float, valid_pp: List[int], model int max_model_parallel is max Model parallel size to use for training. int gbs is the Global Batch Size to use for training. 
""" - pp = [1] - cp = [None] - ep = [None] - mbs = [1, 2, 3, 4, 6, 8] - min_model_parallel = 1 - max_model_parallel = 8 - gbs = 1024 - model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b - if model_size_in_b <= 1.0: - tp = [1, 2] - gbs = 256 - elif model_size_in_b <= 4.0: - tp = [1, 2, 4] - gbs = 1024 - elif model_size_in_b <= 8.0: - tp = [2, 4, 8] - min_model_parallel = 2 - gbs = 2048 - elif model_size_in_b <= 13.0: - tp = [2, 4, 8] - mbs = [1, 2, 3, 4, 6] - min_model_parallel = 2 - gbs = 2048 - elif model_size_in_b <= 25.0: - tp = [4, 8] - mbs = [1, 2, 3, 4] - min_model_parallel = 4 - gbs = 2048 - elif model_size_in_b <= 46.5: - tp = [4, 8] - pp = [1, 2, 4] - mbs = [1, 2, 3, 4] - min_model_parallel = 4 - max_model_parallel = 16 - gbs = 2048 - elif model_size_in_b <= 87.5: - tp = [4, 8] - pp = [2, 4, 6, 8] - mbs = [1, 2, 3, 4] - min_model_parallel = 8 - max_model_parallel = 32 - gbs = 2048 - elif model_size_in_b <= 165.5: - tp = [4, 8] - pp = [4, 6, 8, 16] - mbs = [2, 4, 6, 8] - min_model_parallel = 16 - max_model_parallel = 128 - gbs = 2048 - elif model_size_in_b <= 250.5: - tp = [8] - pp = [4, 8, 16, 32] - mbs = [1, 2, 3, 4] - min_model_parallel = 32 - max_model_parallel = 256 - gbs = 2048 - else: - raise ValueError("No BERT model larger than 250B parameters is supported.") - return tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs + model_size_in_b: int + seq_length: int + gpu_size: int + valid_pp: List[int] + model_measure: str -def _tp_pp_mbs_grid_bert_40gb(model_size_in_b: float, valid_pp: List[int], model_measure: str) -> Tuple[int, int, int]: - """ - Selects grid search space for TP, PP, MBS parameters for BERT and 40GB GPUs. - :param float model_size_in_b: number of parameters in the model. - :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. - :param str model_measure: measure of model size (millions or billions). - :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) - WHERE - int tp is the Tensor Parallelism value to use for training. - int pp is the Pipeline Parallelism value to use for training. - int cp is the Context Parallelism value to use for training. - int ep is the Expert Parallelism value to use for training. - int mbs is the Micro Batch Size to use for training. - int min_model_parallel is min Model parallel size to use for training. - int max_model_parallel is max Model parallel size to use for training. - int gbs is the Global Batch Size to use for training. 
- """ + tp = [1, 2, 4, 8] pp = [1] cp = [None] ep = [None] - mbs = [1, 2, 4, 6, 8] - min_model_parallel = 1 - max_model_parallel = 8 - gbs = 1024 - model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b - if model_size_in_b <= 1.0: - tp = [1, 2, 4] - gbs = 256 - elif model_size_in_b <= 4.0: - tp = [1, 2, 4, 8] - gbs = 1024 - elif model_size_in_b <= 8.0: - tp = [2, 4, 8] - mbs = [1, 2, 4] - gbs = 2048 - elif model_size_in_b <= 13.0: - tp = [2, 4, 8] - mbs = [1, 2, 4] - gbs = 2048 - elif model_size_in_b <= 25.0: - tp = [2, 4, 8] - pp = [1, 2] - mbs = [1, 2, 4] - min_model_parallel = 2 - max_model_parallel = 16 - gbs = 2048 - elif model_size_in_b <= 46.5: - tp = [4, 8] - pp = [1, 2, 4, 8] - mbs = [1, 2, 3] - min_model_parallel = 8 - max_model_parallel = 32 - gbs = 2048 - elif model_size_in_b <= 87.5: - tp = [4, 8] - pp = [2, 4, 6, 8] - mbs = [1, 2, 3] - min_model_parallel = 16 - max_model_parallel = 64 - gbs = 2048 - elif model_size_in_b <= 165.5: - tp = [8] - pp = [4, 6, 8, 16] - mbs = [1, 2] - min_model_parallel = 32 - max_model_parallel = 256 - gbs = 2048 - elif model_size_in_b <= 250.5: - tp = [8] - pp = [8, 16, 32] - mbs = [1, 2] - min_model_parallel = 64 - max_model_parallel = 512 - gbs = 2048 - else: - raise ValueError("No BERT model larger than 250B parameters is supported.") - return tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs + mbs = [1, 2, 4, 6, 8, 12, 16] + + gbs: int = 1920 + min_model_parallel: int = 1 + max_model_parallel: int = 8 + + def init_params(self): + model_size_in_b = self.model_size_in_b / 1000 if self.model_measure == "M" else self.model_size_in_b + gpu_size = self.gpu_size + seq_length = self.seq_length + + if gpu_size == 80: + if model_size_in_b <= 1.0: + self.tp = [1, 2] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 3, 4, 6] + self.min_model_parallel = 2 + self.gbs = 2048 + elif model_size_in_b <= 25.0: + self.tp = [4, 8] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 4 + self.gbs = 2048 + elif model_size_in_b <= 46.5: + self.tp = [4, 8] + self.pp = [1, 2, 4] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 4 + self.max_model_parallel = 16 + self.gbs = 2048 + elif model_size_in_b <= 87.5: + self.tp = [4, 8] + self.pp = [2, 4, 6, 8] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 165.5: + self.tp = [4, 8] + self.pp = [4, 6, 8, 16] + self.mbs = [2, 4, 6, 8] + self.min_model_parallel = 16 + self.max_model_parallel = 128 + self.gbs = 2048 + elif model_size_in_b <= 250.5: + self.tp = [8] + self.pp = [4, 8, 16, 32] + self.mbs = [1, 2, 3, 4] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + else: + raise ValueError("No BERT model larger than 250B parameters is supported.") + elif gpu_size == 40: + if model_size_in_b <= 1.0: + self.tp = [1, 2, 4] + self.gbs = 256 + elif model_size_in_b <= 4.0: + self.tp = [1, 2, 4, 8] + self.gbs = 1024 + elif model_size_in_b <= 8.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 13.0: + self.tp = [2, 4, 8] + self.mbs = [1, 2, 4] + self.gbs = 2048 + elif model_size_in_b <= 25.0: + self.tp = [2, 4, 8] + self.pp = [1, 2] + self.mbs = [1, 2, 4] + self.min_model_parallel = 2 + self.max_model_parallel = 16 + 
self.gbs = 2048 + elif model_size_in_b <= 46.5: + self.tp = [4, 8] + self.pp = [1, 2, 4, 8] + self.mbs = [1, 2, 3] + self.min_model_parallel = 8 + self.max_model_parallel = 32 + self.gbs = 2048 + elif model_size_in_b <= 87.5: + self.tp = [4, 8] + self.pp = [2, 4, 6, 8] + self.mbs = [1, 2, 3] + self.min_model_parallel = 16 + self.max_model_parallel = 64 + self.gbs = 2048 + elif model_size_in_b <= 165.5: + self.tp = [8] + self.pp = [4, 6, 8, 16] + self.mbs = [1, 2] + self.min_model_parallel = 32 + self.max_model_parallel = 256 + self.gbs = 2048 + elif model_size_in_b <= 250.5: + self.tp = [8] + self.pp = [8, 16, 32] + self.mbs = [1, 2] + self.min_model_parallel = 64 + self.max_model_parallel = 512 + self.gbs = 2048 + else: + raise ValueError("No BERT model larger than 250B parameters is supported.") def _calculate_tp_pp_mbs_grid( @@ -975,59 +949,15 @@ def _calculate_tp_pp_mbs_grid( ) params.init_params() elif model_name in ["t5", "mt5"]: - if gpu_memory_gb == 80: - ( - tp, - pp, - cp, - ep, - mbs, - min_model_parallel, - max_model_parallel, - gbs, - ) = _tp_pp_mbs_grid_t5_80gb( - model_size_in_b=model_size_in_b, valid_pp=valid_pp, model_measure=model_measure - ) - elif gpu_memory_gb == 40: - ( - tp, - pp, - cp, - ep, - mbs, - min_model_parallel, - max_model_parallel, - gbs, - ) = _tp_pp_mbs_grid_t5_40gb( - model_size_in_b=model_size_in_b, valid_pp=valid_pp, model_measure=model_measure - ) + params = T5GridSearch( + model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length, model_measure=model_measure, gpu_size=gpu_memory_gb + ) + params.init_params() elif model_name == "bert": - if gpu_memory_gb == 80: - ( - tp, - pp, - cp, - ep, - mbs, - min_model_parallel, - max_model_parallel, - gbs, - ) = _tp_pp_mbs_grid_bert_80gb( - model_size_in_b=model_size_in_b, valid_pp=valid_pp, model_measure=model_measure - ) - elif gpu_memory_gb == 40: - ( - tp, - pp, - cp, - ep, - mbs, - min_model_parallel, - max_model_parallel, - gbs, - ) = _tp_pp_mbs_grid_bert_40gb( - model_size_in_b=model_size_in_b, valid_pp=valid_pp, model_measure=model_measure - ) + params = BertGridSearch( + model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length, model_measure=model_measure, gpu_size=gpu_memory_gb + ) + params.init_params() else: raise NotImplementedError("Model name not implemented.") From 80054d769ab3be72d764d0b8a8b98743eb4640aa Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 19:14:31 +0000 Subject: [PATCH 13/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../auto_configurator/core/training_config.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 052397682fb3..be20571df574 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -626,7 +626,7 @@ class T5GridSearch: cp = [None] ep = [None] mbs = [1, 2, 4, 6, 8, 12, 16] - + gbs: int = 1920 min_model_parallel: int = 1 max_model_parallel: int = 8 @@ -775,7 +775,7 @@ class BertGridSearch: cp = [None] ep = [None] mbs = [1, 2, 4, 6, 8, 12, 16] - + gbs: int = 1920 min_model_parallel: int = 1 max_model_parallel: int = 8 @@ -950,12 +950,20 @@ def _calculate_tp_pp_mbs_grid( params.init_params() elif model_name in ["t5", "mt5"]: params = T5GridSearch( - model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length, 
model_measure=model_measure, gpu_size=gpu_memory_gb + model_size_in_b=model_size_in_b, + valid_pp=valid_pp, + seq_length=seq_length, + model_measure=model_measure, + gpu_size=gpu_memory_gb, ) params.init_params() elif model_name == "bert": params = BertGridSearch( - model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length, model_measure=model_measure, gpu_size=gpu_memory_gb + model_size_in_b=model_size_in_b, + valid_pp=valid_pp, + seq_length=seq_length, + model_measure=model_measure, + gpu_size=gpu_memory_gb, ) params.init_params() else: From 227a7384a720c80a127c54682c5f7ed964c7e9f3 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 12:20:44 -0700 Subject: [PATCH 14/63] fix dataclasses usage Signed-off-by: dimapihtar --- .../auto_configurator/core/training_config.py | 62 ++++++++----------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index be20571df574..24dc3f0f5ec4 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -294,7 +294,7 @@ class GPT3GridSearch: model_size_in_b: int seq_length: int - gpu_size: int + gpu_memory_gb: int valid_pp: List[int] model_measure: str @@ -310,10 +310,10 @@ class GPT3GridSearch: def init_params(self): model_size_in_b = self.model_size_in_b / 1000 if self.model_measure == "M" else self.model_size_in_b - gpu_size = self.gpu_size + gpu_memory_gb = self.gpu_memory_gb seq_length = self.seq_length - if gpu_size == 80: + if gpu_memory_gb == 80: if seq_length == 2048: if model_size_in_b <= 1.0: self.tp = [1, 2] @@ -517,7 +517,7 @@ def init_params(self): self.min_model_parallel = 16 self.max_model_parallel = 32 self.gbs = 64 - elif gpu_size == 40: + elif gpu_memory_gb == 40: if model_size_in_b <= 1.0: self.tp = [1, 2, 4] self.mbs = [1, 2, 4, 8] @@ -617,7 +617,7 @@ class T5GridSearch: model_size_in_b: int seq_length: int - gpu_size: int + gpu_memory_gb: int valid_pp: List[int] model_measure: str @@ -633,10 +633,10 @@ class T5GridSearch: def init_params(self): model_size_in_b = self.model_size_in_b / 1000 if self.model_measure == "M" else self.model_size_in_b - gpu_size = self.gpu_size + gpu_memory_gb = self.gpu_memory_gb seq_length = self.seq_length - if gpu_size == 80: + if gpu_memory_gb == 80: if model_size_in_b <= 1.0: self.tp = [1, 2] self.mbs = [16, 32, 64, 128] @@ -688,7 +688,7 @@ def init_params(self): self.min_model_parallel = 64 self.max_model_parallel = 256 self.gbs = 1920 - elif gpu_size == 40: + elif gpu_memory_gb == 40: if model_size_in_b <= 1.0: self.tp = [1, 2] self.mbs = [16, 32, 64, 128] @@ -766,7 +766,7 @@ class BertGridSearch: model_size_in_b: int seq_length: int - gpu_size: int + gpu_memory_gb: int valid_pp: List[int] model_measure: str @@ -782,10 +782,10 @@ class BertGridSearch: def init_params(self): model_size_in_b = self.model_size_in_b / 1000 if self.model_measure == "M" else self.model_size_in_b - gpu_size = self.gpu_size + gpu_memory_gb = self.gpu_memory_gb seq_length = self.seq_length - if gpu_size == 80: + if gpu_memory_gb == 80: if model_size_in_b <= 1.0: self.tp = [1, 2] self.gbs = 256 @@ -836,7 +836,7 @@ def init_params(self): self.gbs = 2048 else: raise ValueError("No BERT model larger than 250B parameters is supported.") - elif gpu_size == 40: + elif gpu_memory_gb == 40: if model_size_in_b <= 1.0: self.tp = [1, 2, 4] self.gbs = 256 @@ -939,35 
+939,25 @@ def _calculate_tp_pp_mbs_grid( multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 ] # Only divisors of num_layers are possible. + kwargs = { + "model_size_in_b": model_size_in_b, + "valid_pp": valid_pp, + "seq_length": seq_length, + "model_measure": model_measure, + "gpu_memory_gb": gpu_memory_gb, + } + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: - params = GPT3GridSearch( - model_size_in_b=model_size_in_b, - valid_pp=valid_pp, - seq_length=seq_length, - model_measure=model_measure, - gpu_size=gpu_memory_gb, - ) - params.init_params() + search_class = GPT3GridSearch elif model_name in ["t5", "mt5"]: - params = T5GridSearch( - model_size_in_b=model_size_in_b, - valid_pp=valid_pp, - seq_length=seq_length, - model_measure=model_measure, - gpu_size=gpu_memory_gb, - ) - params.init_params() + search_class = T5GridSearch elif model_name == "bert": - params = BertGridSearch( - model_size_in_b=model_size_in_b, - valid_pp=valid_pp, - seq_length=seq_length, - model_measure=model_measure, - gpu_size=gpu_memory_gb, - ) - params.init_params() + search_class = BertGridSearch else: raise NotImplementedError("Model name not implemented.") + + params = search_class(**kwargs) + params.init_params() # Override the tp, pp, mbs search if indicated in the config params. if tp_sizes is not None and tp_sizes != "auto": From d0acbca4f9ccb4e61134ef6ca9e9447021dde195 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 27 Aug 2024 19:21:33 +0000 Subject: [PATCH 15/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/core/training_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 24dc3f0f5ec4..aaf87403c6b1 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -955,7 +955,7 @@ def _calculate_tp_pp_mbs_grid( search_class = BertGridSearch else: raise NotImplementedError("Model name not implemented.") - + params = search_class(**kwargs) params.init_params() From 9a26476d38d8a148030140f3614667d966afc85d Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 28 Aug 2024 03:18:32 -0700 Subject: [PATCH 16/63] remove unused imports Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/base_configs/basic.py | 1 - .../llm/tools/auto_configurator/base_configs/custom.py | 3 --- .../llm/tools/auto_configurator/base_configs/gemma.py | 2 -- .../llm/tools/auto_configurator/base_configs/gpt.py | 3 --- .../llm/tools/auto_configurator/base_configs/llama.py | 2 -- .../llm/tools/auto_configurator/base_configs/mistral.py | 3 --- .../llm/tools/auto_configurator/base_configs/mixtral.py | 3 --- 7 files changed, 17 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py index 616629a876f4..f58fd4a1eecf 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
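Taken together, the dataclass refactor and the follow-up fixes above reduce _calculate_tp_pp_mbs_grid to a small dispatch: build one kwargs dict and pick a grid-search class per model family. A hedged sketch of the resulting call path inside that function (class and field names mirror the diffs; the variables are the function's own parameters):

    kwargs = {
        "model_size_in_b": model_size_in_b,
        "valid_pp": valid_pp,
        "seq_length": seq_length,
        "model_measure": model_measure,
        "gpu_memory_gb": gpu_memory_gb,
    }
    search_class = GPT3GridSearch  # T5GridSearch for t5/mt5, BertGridSearch for bert
    params = search_class(**kwargs)
    params.init_params()
    # Callers now read params.tp, params.pp, params.cp, params.ep, params.mbs,
    # params.min_model_parallel, params.max_model_parallel and params.gbs
    # instead of unpacking the eight-element tuples returned previously.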
-import numpy as np from megatron.core.optimizer import OptimizerConfig from nemo.collections.llm.utils import Config diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py b/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py index 7f8a283fcaaa..a66254a320c6 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import os - from nemo.collections.llm.tools.auto_configurator import base_configs from .basic import Basic diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py b/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py index d4a2665adf56..e332c68c4afa 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import os import torch from nemo.collections import llm diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py b/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py index ef4082ac9c87..5f8e47628bc9 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import os - from nemo.collections import llm from nemo.collections.llm.utils import Config diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py b/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py index 3676749be653..7f57fbe2383e 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import os import torch from nemo.collections import llm diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py b/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py index 830a754c8730..6972cee1107f 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy -import os - from nemo.collections import llm from nemo.collections.llm.utils import Config diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py b/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py index ae08e9ef5f05..b0d65849a4fc 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py @@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import copy -import os - from nemo.collections import llm from nemo.collections.llm.utils import Config From 131503112d2f4d6866b7dca94b448b9524877fa9 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 28 Aug 2024 03:24:45 -0700 Subject: [PATCH 17/63] remove extra function Signed-off-by: dimapihtar --- .../auto_configurator/core/search_config.py | 4 ++-- .../auto_configurator/core/training_config.py | 17 ----------------- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/search_config.py b/nemo/collections/llm/tools/auto_configurator/core/search_config.py index 03ea6ca8c74c..0fd2492b89d8 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/search_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/search_config.py @@ -15,7 +15,7 @@ import os from typing import Optional -from nemo.collections.llm.tools.auto_configurator.core.training_config import search_training_config +from nemo.collections.llm.tools.auto_configurator.core.training_config import generate_grid_search_configs from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config SUPPORTED_MODELS = [ @@ -79,6 +79,6 @@ def search_configs(cfg: dict): ) # Launch grid search for training constraints - configs = search_training_config(base_cfg, train_cfg) + configs = generate_grid_search_configs(base_cfg, train_cfg) return configs diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index aaf87403c6b1..f2a507a9a386 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -23,23 +23,6 @@ from nemo.collections.llm.tools.auto_configurator.core import utils -def search_training_config( - base_cfg: dict, - train_cfg: dict, -) -> None: - """ - Entry point for the Auto Configurator search. This function calls other functions - to generate the grid of possible configurations. - :param dict base_cfg: base configuration of the model to be trained. - :param dict base_cfg: config of the model that will be launched. - :return: dict with generated configs. - """ - # Generate candidate configs. 
- configs = generate_grid_search_configs(base_cfg, train_cfg) - - return configs - - def generate_grid_search_configs( base_cfg: dict, train_cfg: dict, From 1aafc2016856e9b5eeeb4068a77598a9a9cee014 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 28 Aug 2024 05:17:41 -0700 Subject: [PATCH 18/63] fix docstring style Signed-off-by: dimapihtar --- .../auto_configurator/base_configs/basic.py | 43 +- .../auto_configurator/base_configs/custom.py | 18 +- .../auto_configurator/base_configs/gemma.py | 19 +- .../auto_configurator/base_configs/gpt.py | 19 +- .../auto_configurator/base_configs/llama.py | 19 +- .../auto_configurator/base_configs/mistral.py | 19 +- .../auto_configurator/base_configs/mixtral.py | 19 +- .../auto_configurator/core/base_config.py | 114 ++-- .../core/calculate_performance.py | 66 +- .../auto_configurator/core/training_config.py | 142 ++--- .../llm/tools/auto_configurator/core/utils.py | 582 +++++++++--------- .../llm/tools/auto_configurator/runner.py | 87 +-- 12 files changed, 573 insertions(+), 574 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py index f58fd4a1eecf..4044c1f8bd10 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py @@ -27,11 +27,12 @@ def __init__( cfg: dict = {}, ): """ - :param str name: model name. - :param int version: model version. - :param int size: model size. - :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. - :param dict cfg: auto configurator runner config. + Args: + name (str): model name. + version (int): model version. + size (int): model size. + measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + cfg (dict): auto configurator runner config. """ self.name = name @@ -55,10 +56,10 @@ def model_config(self): None def get_optim_config(self) -> OptimizerConfig: - """ - Function that returns optimizer config. - :return: optim config. - :rtype: OptimizerConfig. + """Function that returns optimizer config. + + Returns: + OptimizerConfig: optimizer config. """ optim_params = { "optimizer": "adam", @@ -85,10 +86,10 @@ def get_optim_config(self) -> OptimizerConfig: return optim_config def get_trainer_config(self) -> dict: - """ - Function that returns config for PTL trainer. - :return: trainer config. - :rtype: dict. + """Function that returns config for PTL trainer. + + Returns: + dict: trainer config. """ trainer_config = { @@ -110,10 +111,10 @@ def get_trainer_config(self) -> dict: return trainer_config def get_data_config(self) -> dict: - """ - Function that returns dataset config. - :return: data config. - :rtype: dict. + """Function that returns dataset config. + + Returns: + dict: data config. """ data_config = { @@ -128,10 +129,10 @@ def get_data_config(self) -> dict: return data_config def get_run_config(self) -> dict: - """ - Function that returns config for cluster job. - :return: cluster job config. - :rtype: dict. + """Function that returns config for cluster job. + + Returns: + dict: cluster job config. 
""" run_config = { diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py b/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py index a66254a320c6..9bcb6ef45777 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py @@ -18,18 +18,24 @@ def custom(name, cfg): + """Function that return custom model class. + + Args: + name (srt): model type. + cfg (dict): auto configurator runner config. + + Returns + Custom: class object. """ - Function taht return custom model class. - :param dict cfg: auto configurator runner config. - :return: Custom class object. - """ + basic_class = getattr(base_configs, name) class Custom(basic_class): def __init__(self, name, cfg): """ - :param str name: model name. - :param dict cfg: auto configurator runner config. + Args: + name (srt): model type. + cfg (dict): auto configurator runner config. """ super().__init__(name=name, cfg=cfg) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py b/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py index e332c68c4afa..dfe774441161 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py @@ -30,21 +30,22 @@ def __init__( cfg: dict = {}, ): """ - :param str name: model name. - :param int version: model version. - :param int size: model size. - :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. - :param dict cfg: auto configurator runner config. + Args: + name (str): model name. + version (int): model version. + size (int): model size. + measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + cfg (dict): auto configurator runner config. """ super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) self.config_name = f"{self.name}Config{self.size}{self.measure}" def get_model_config(self) -> Config: - """ - Function that returns model config. - :return: model config. - :rtype: Config. + """Function that returns model config. + + Returns: + Config: model config. """ model_class = getattr(llm, self.config_name) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py b/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py index 5f8e47628bc9..aa8f184abf01 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py @@ -28,21 +28,22 @@ def __init__( cfg: dict = {}, ): """ - :param str name: model name. - :param int version: model version. - :param int size: model size. - :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. - :param dict cfg: auto configurator runner config. + Args: + name (str): model name. + version (int): model version. + size (int): model size. + measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + cfg (dict): auto configurator runner config. """ super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) self.config_name = f"{self.name}Config{self.size}{self.measure}" def get_model_config(self) -> Config: - """ - Function that returns model config. - :return: model config. - :rtype: Config. + """Function that returns model config. + + Returns: + Config: model config. 
""" model_class = getattr(llm, self.config_name) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py b/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py index 7f57fbe2383e..610e89480798 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py @@ -30,21 +30,22 @@ def __init__( cfg: dict = {}, ): """ - :param str name: model name. - :param int version: model version. - :param int size: model size. - :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. - :param dict cfg: auto configurator runner config. + Args: + name (str): model name. + version (int): model version. + size (int): model size. + measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + cfg (dict): auto configurator runner config. """ super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) self.config_name = f"{self.name}{self.version}Config{self.size}{self.measure}" def get_model_config(self) -> Config: - """ - Function that returns model config. - :return: model config. - :rtype: Config. + """Function that returns model config. + + Returns: + Config: model config. """ model_class = getattr(llm, self.config_name) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py b/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py index 6972cee1107f..c35c7ecab4b4 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py @@ -28,21 +28,22 @@ def __init__( cfg: dict = {}, ): """ - :param str name: model name. - :param int version: model version. - :param int size: model size. - :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. - :param dict cfg: auto configurator runner config. + Args: + name (str): model name. + version (int): model version. + size (int): model size. + measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + cfg (dict): auto configurator runner config. """ super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) self.config_name = f"{self.name}Config{self.size}{self.measure}" def get_model_config(self) -> Config: - """ - Function that returns model config. - :return: model config. - :rtype: Config. + """Function that returns model config. + + Returns: + Config: model config. """ model_class = getattr(llm, self.config_name) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py b/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py index b0d65849a4fc..6ad0adb38e38 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py @@ -28,21 +28,22 @@ def __init__( cfg: dict = {}, ): """ - :param str name: model name. - :param int version: model version. - :param int size: model size. - :param str measure: meausre of model size. "M" if model size in millions, "B" if in billions. - :param dict cfg: auto configurator runner config. + Args: + name (str): model name. + version (int): model version. + size (int): model size. + measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + cfg (dict): auto configurator runner config. 
""" super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) self.config_name = f"{self.name}Config{self.version}x{self.size}{self.measure}" def get_model_config(self) -> Config: - """ - Function that returns model config. - :return: model config. - :rtype: Config. + """Function that returns model config. + + Returns: + Config: model config. """ model_class = getattr(llm, self.config_name) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index 1659cdfd2ecb..5d172a6df29b 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Generates base configuration for given model.""" - import math import os from typing import Tuple @@ -29,19 +27,24 @@ def calculate_model_size( num_tokens_in_b: int = 300, model_name: str = "gpt3", ) -> float: + """Estimates a model size to be trained given the constraints. If the + model_size is provided, it estimates the time to train it with the given + constraints. + + Example: + output 5B params to train for 7 days with 160 GPUs. + + Args: + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + max_training_days (float): number of days to train the model for. + model_size_in_b (float): number of parameters in the model, if known. + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. + + Returns: + float: number of parameters to use for training. """ - Estimates a model size to be trained given the constraints. If the - model_size is provided, it estimates the time to train it with the given - constraints. - Example: output 5B params to train for 7 days with 160 GPUs. - :param int gpu_count: number of gpus to use (num_nodes * gpus_per_node). - :param float max_training_days: number of days to train the model for. - :param float model_size_in_b: number of parameters in the model, if known. - :param int tflops_per_gpu: estimated number of TFLOPS/s per GPU. - :param int num_tokens_in_b: number of tokens to train the model for. - :return: number of parameters to use for training. - :rtype: float - """ + # Model size is not known, must be estimated. if model_size_in_b is None: model_size_in_b = _estimate_model_size( @@ -77,18 +80,22 @@ def _estimate_model_size( num_tokens_in_b: int, model_name: str, ) -> float: + """Estimates model size given time and hardware constraints. It's only used if the model size is not provided by the user. + + Args: + max_training_days (float): number of days to train the model for. + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. + model_name (str): name of the model, such as gpt3, t5, mt5... + + Returns: + float: number of parameters to use for training. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. """ - Estimates model size given time and hardware constraints. It's only used if the model size is - not provided by the user. - :param float max_training_days: number of days to train the model for. - :param int gpu_count: number of gpus to use (num_nodes * gpus_per_node). 
- :param int tflops_per_gpu: estimated number of TFLOPS/s per GPU. - :param int num_tokens_in_b: number of tokens to train the model for. - :param str model_name: name of the model, such as gpt3, t5, mt5... - :return: number of parameters to use for training. - :rtype: float - :raises NotImplementedError: if the model_name is not one of the supported models. - """ + model_penalty = 0.87 if model_name == "mt5" else 1.0 valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma"] try: @@ -118,18 +125,22 @@ def _estimate_training_time( num_tokens_in_b: int, model_name: str, ) -> float: + """Estimates training time for a given model size and hardware constraint. To be used when a model size is provided by the user. + + Args: + model_size_in_b (float): number of parameters to use for training. + gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). + tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. + num_tokens_in_b (int): number of tokens to train the model for. + model_name (str): name of the model, such as gpt3, t5, mt5... + + Returns: + float: number of days it will take to train the model. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. """ - Estimates training time for a given model size and hardware constraint. To be used when - a model size is provided by the user. - :param float model_size_in_b: number of parameters to use for training. - :param int gpu_count: number of gpus to use (num_nodes * gpus_per_node). - :param int tflops_per_gpu: estimated number of TFLOPS/s per GPU. - :param int num_tokens_in_b: number of tokens to train the model for. - :param str model_name: name of the model, such as gpt3, t5, mt5... - :return: number of days it will take to train the model. - :rtype: float - :raises NotImplementedError: if the model_name is not one of the supported models. - """ + model_penalty = 1.15 if model_name == "mt5" else 1.0 valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma"] try: @@ -148,33 +159,4 @@ def _estimate_training_time( print(f"Cannot divide by zero. This can happen if gpu_count or tflops_per_gpu are zero: {err}") except NotImplementedError as err: print(f"Training time estimation is only available for {valid_models}: {err}") - return None - - -def generate_base_config( - model_name: str, - model_version: int, - model_size_in_b: int, - model_measure: str, - cfg: dict, -): - """ - Generates base config dictionary for a given model name and size. - :param str model_name: name of the model, such as gpt3, t5, mt5... - :param int model_version: version of model to be trained. - :param float model_size_in_b: number of parameters in the model, if known. - :param str model_measure: measure of model size (millions or billions). - :param dict cfg: full config object. - :return: base config object for the given model. 
- :rtype: dict - """ - - base_cfg = generic_base_config( - model_name=model_name, - model_version=model_version, - model_size_in_b=model_size_in_b, - model_measure=model_measure, - cfg=cfg, - ) - - return base_cfg + return None \ No newline at end of file diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py index d4d9c84dd204..32f392a2f8da 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -41,24 +41,27 @@ def get_results( custom_model: Optional[bool] = False, output_top_n: Optional[int] = 10, ): + """Generates possible train configs. + + Args: + training_logs (str): path to the dicrectory with training logs. + path_to_save (str): path where to save performance results. + model_name (str): model name used for auto conf search. + num_nodes (int): number of nodes used for auto conf search. + model_version (int): version of model. 3 for GPT3, 2 for Llama2. + seq_length (int): model sequence length. + global_batch_size (int): model global batch size. + vocab_size (int): size of tokenizer vocabulary. + model_size (Optional[int]): size of model used for auto conf search. + model_measure (Optional[str]): "M" if model_size is specified in millions. "B" if in billions. + gpus_per_node (Optional[int]): number of GPUs per node used for auto conf search. + max_training_days (Optional[int]): number of days expected model to be trained. + tflops_per_gpu (Optional[int]): estimated tflops per GPU. + num_tokens_in_b (Optional[int]): number of tokens in billions in train dataset. + custom_model (Optional[bool]): set to True if custom model was used. + output_top_n (Optional[int]): Number of configs to be printed out as best configs. """ - :param str training_logs: path to the dicrectory with training logs. - :param str path_to_save: path where to save performance results. - :param str model_name: model name used for auto conf search. - :param int num_nodes: number of nodes used for auto conf search. - :param int model_version: version of model. 3 for GPT3, 2 for Llama2. - :param int seq_length: model sequence length. - :param int global_batch_size: model global batch size. - :param int vocab_size: size of tokenizer vocabulary. - :param Optional[int] model_size: size of model used for auto conf search. - :param Optional[str] model_measure: "M" if model_size is specified in millions. "B" if in billions. - :param Optional[int] gpus_per_node: number of GPUs per node used for auto conf search. - :param Optional[int] max_training_days: number of days expected model to be trained. - :param Optional[int] tflops_per_gpu: estimated tflops per GPU. - :param Optional[int] num_tokens_in_b: number of tokens in billions in train dataset. - :param Optional[bool] custom_model: set to True if custom model was used. - :param Optional[int] output_top_n: Number of configs to be printed out as best configs. - """ + # Get model architecture cfg = locals() cfg["gpu_count"] = num_nodes * gpus_per_node @@ -250,13 +253,15 @@ def calculate_tflops( time_per_step, ): """Calculates model and hardware TFLOPS for each model. 
- GPT-3 Formulas: - Model FLOPs = (24𝐵𝑠ℎ^2 + 4𝐵��^2ℎ) x (3 x num_layers) + 6𝐵𝑠ℎ - T5/mT5 Formula: - Model FLOPs = - Bert Formula: - Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL)) + + GPT-3 Formulas: + Model FLOPs = (24𝐵𝑠ℎ^2 + 4𝐵��^2ℎ) x (3 x num_layers) + 6𝐵𝑠ℎ + T5/mT5 Formula: + Model FLOPs = + Bert Formula: + Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL)) """ + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral"]: # Model FLOPS calculation model_flops = ( @@ -310,13 +315,16 @@ def calculate_tflops( def find_error(error_file: str, errors: list = ["CUDA out of memory"]): + """Finds the error among job output. + + Args: + :param list errors: list of "popular" errors. + :param str error_file: path to the job output. + + Returns: + str: serror message if job has been failed because of one of listed errors or None if not. """ - Finds the error among job output. - :param list errors: list of "popular" errors. - :param str error_file: path to the job output. - :return: str error if job has been failed because of one of listed errors and None if not. - :rtype: str - """ + error = None with open(error_file, "r") as f: output = f.read() diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index f2a507a9a386..376d96c1012a 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Generates training configs.""" - import os import shutil import subprocess @@ -27,12 +25,14 @@ def generate_grid_search_configs( base_cfg: dict, train_cfg: dict, ) -> Tuple[str, List[int], int]: - """ - Generates the grid of all possible configurations for the given model, and stores - each different configuration in a yaml file. - :param dict base_cfg: base configuration of the model to be trained. - :param dict base_cfg: train configuration of the model to be trained. - :return: dict with generated configs. + """Generates the grid of all possible configurations for the given model, and stores each different configuration in a yaml file. + + Args: + base_cfg (dict): base configuration of the model to be trained. + train_cfg (dict): train configuration of the model to be trained. + + Returns: + dict: generated configs. """ model_name = train_cfg.get("model_type") @@ -257,29 +257,21 @@ def _set_activations_checkpoint_params( @dataclass class GPT3GridSearch: - """ - Selects grid search space for TP, PP, MBS parameters for GPT-3 and 80GB GPUs. - :param float model_size_in_b: number of parameters in the model. - :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. - :param int seq length: sequence length to use for training. - :param str model_measure: measure of model size (millions or billions). - :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) - WHERE - int tp is the Tensor Parallelism value to use for training. - int pp is the Pipeline Parallelism value to use for training. - int cp is the Context Parallelism value to use for training. - int ep is the Expert Parallelism value to use for training. - int mbs is the Micro Batch Size to use for training. - int min_model_parallel is min Model parallel size to use for training. - int max_model_parallel is max Model parallel size to use for training. 
- int gbs is the Global Batch Size to use for training. + """Selects grid search space for TP, PP, MBS parameters for GPT-3 and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq length (int): sequence length to use for training. + model_measure (str): measure of model size (millions or billions). + gpu_memory_gb (int): size of GPU memory in GB. """ model_size_in_b: int - seq_length: int - gpu_memory_gb: int valid_pp: List[int] + seq_length: int model_measure: str + gpu_memory_gb: int tp = [1, 2, 4, 8] pp = [1] @@ -581,21 +573,14 @@ def init_params(self): @dataclass class T5GridSearch: - """ - Selects grid search space for TP, PP, MBS parameters for T5/mT5 and 80GB GPUs. - :param float model_size_in_b: number of parameters in the model. - :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. - :param str model_measure: measure of model size (millions or billions). - :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) - WHERE - int tp is the Tensor Parallelism value to use for training. - int pp is the Pipeline Parallelism value to use for training. - int cp is the Context Parallelism value to use for training. - int ep is the Expert Parallelism value to use for training. - int mbs is the Micro Batch Size to use for training. - int min_model_parallel is min Model parallel size to use for training. - int max_model_parallel is max Model parallel size to use for training. - int gbs is the Global Batch Size to use for training. + """Selects grid search space for TP, PP, MBS parameters for T5/mT5 and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq length (int): sequence length to use for training. + model_measure (str): measure of model size (millions or billions). + gpu_memory_gb (int): size of GPU memory in GB. """ model_size_in_b: int @@ -730,21 +715,14 @@ def init_params(self): @dataclass class BertGridSearch: - """ - Selects grid search space for TP, PP, MBS parameters for BERT and 80GB GPUs. - :param float model_size_in_b: number of parameters in the model. - :param List[int] valid_pp: list of valid Pipeline Parallelism (PP) values for this config. - :param str model_measure: measure of model size (millions or billions). - :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) - WHERE - int tp is the Tensor Parallelism value to use for training. - int pp is the Pipeline Parallelism value to use for training. - int cp is the Context Parallelism value to use for training. - int ep is the Expert Parallelism value to use for training. - int mbs is the Micro Batch Size to use for training. - int min_model_parallel is min Model parallel size to use for training. - int max_model_parallel is max Model parallel size to use for training. - int gbs is the Global Batch Size to use for training. + """Selects grid search space for TP, PP, MBS parameters for BERT and 80GB GPUs. + + Args: + model_size_in_b (float): number of parameters in the model. + valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. + seq length (int): sequence length to use for training. + model_measure (str): measure of model size (millions or billions). + gpu_memory_gb (int): size of GPU memory in GB. 
""" model_size_in_b: int @@ -880,26 +858,22 @@ def _calculate_tp_pp_mbs_grid( seq_length: int, train_cfg: dict, ) -> Tuple[int, int, int]: + """Selects grid search space for TP, PP, MBS parameters for any model, and calls the necessary heuristics function accordingly. + + Args: + model_size_in_b (float): number of parameters in the model. + num_layers (int): number of layers in the model config. + model_name (str): name of the model to be used, such as gpt3, t5, mt5... + seq_length (int): sequence length to use for training. + train_cfg (dict): config of the model that will be launched. + + Returns: + dataclass object with model parallelism parameters. + + Raises: + NotImplementedError: if the model_name is not one of the supported models. """ - Selects grid search space for TP, PP, MBS parameters for any model, and calls the necessary - heuristics function accordingly. - :param float model_size_in_b: number of parameters in the model. - :param int num_layers: number of layers in the model config. - :param str model_name: name of the model to be used, such as gpt3, t5, mt5... - :param int seq_length: sequence length to use for training. - :param dict train_cfg: config of the model that will be launched. - :returns: tuple (tp, pp, cp, ep, mbs, min_model_parallel, max_model_parallel, gbs) - WHERE - int tp is the Tensor Parallelism value to use for training. - int pp is the Pipeline Parallelism value to use for training. - int cp is the Context Parallelism value to use for training. - int ep is the Expert Parallelism value to use for training. - int mbs is the Micro Batch Size to use for training. - int min_model_parallel is min Model parallel size to use for training. - int max_model_parallel is max Model parallel size to use for training. - int gbs is the Global Batch Size to use for training. - :raises NotImplementedError: if the model_name is not one of the supported models. - """ + tp_sizes = train_cfg.get("tensor_parallel_sizes") pp_sizes = train_cfg.get("pipeline_parallel_sizes") cp_sizes = train_cfg.get("context_parallel_sizes", None) @@ -944,19 +918,19 @@ def _calculate_tp_pp_mbs_grid( # Override the tp, pp, mbs search if indicated in the config params. 
if tp_sizes is not None and tp_sizes != "auto": - tp = tp_sizes + params.tp = tp_sizes if pp_sizes is not None and pp_sizes != "auto": - pp = pp_sizes + params.pp = pp_sizes if cp_sizes is not None and cp_sizes != "auto": - cp = cp_sizes + params.cp = cp_sizes if ep_sizes is not None and ep_sizes != "auto": - ep = ep_sizes + params.ep = ep_sizes if mbs_sizes is not None and mbs_sizes != "auto": - mbs = mbs_sizes + params.mbs = mbs_sizes if gbs_size is not None and gbs_size != "auto": - gbs = gbs_size + params.gbs = gbs_size if min_model_parallel_size is not None and min_model_parallel_size != "auto": - min_model_parallel = min_model_parallel_size + params.min_model_parallel = min_model_parallel_size if max_model_parallel_size is not None and max_model_parallel_size != "auto": - max_model_parallel = max_model_parallel_size - return params + params.max_model_parallel = max_model_parallel_size + return params \ No newline at end of file diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index 6e2a9a8208d0..b7f7c0f1d8e0 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Utility functions for the Auto Configurator tool.""" import copy +from dataclasses import dataclass from typing import List, Optional, Tuple from nemo.collections.llm.tools.auto_configurator import base_configs @@ -27,6 +27,247 @@ } +@dataclass +class ModelSizeParams: + """Calculates the parameters that affect model_size: hidden size, attention heads, KV channels, and FFN size. It also calculates the learning rate. + + Args: + model_size_in_b (float): number of parameters in the desired model config, in billions. + seq_length (int): sequence length to be used during training. + vocab_size (int): size of the vocabulary to use for training. + model_name (str): name of the model to be trained, i.e. gpt3, t5, mt5... + + Raises: + ValueError: if the model size is larger than the max supported model size. + NotImplementedError: if the model name is not supported. 
+ """ + + model_size_in_b: float + vocab_size: int + seq_length: int + model_name: str + + # Model size params + layers: int = None + hs: int = None + att_h: int = None + ffn: int = None + kv: int = None + lr: float = None + + def init_params(self): + model_name = self.model_name + model_size_in_b = self.model_size_in_b + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: + self.ffn = 4 * self.hs + if model_size_in_b < 0.25: + self.hs, self.att_h, self.lr = 768, 12, 6e-4 + elif model_size_in_b < 0.5: + self.hs, self.att_h, self.lr = 1024, 16, 3e-4 + elif model_size_in_b < 1: + self.hs, self.att_h, self.lr = 1536, 16, 2.5e-4 + elif model_size_in_b < 2: + self.hs, self.att_h, self.lr = 2048, 16, 2e-4 + elif model_size_in_b < 3: + self.hs, self.att_h, self.lr = 2560, 32, 1.6e-4 + elif model_size_in_b < 4.5: + self.hs, self.att_h, self.lr = 3072, 32, 1.4e-4 + elif model_size_in_b < 8: + self.hs, self.att_h, self.lr = 4096, 32, 1.2e-4 + elif model_size_in_b < 15: + self.hs, self.att_h, self.lr = 5120, 40, 1e-4 + elif model_size_in_b < 25: + self.hs, self.att_h, self.lr = 6144, 48, 1e-4 + elif model_size_in_b < 52: + self.hs, self.att_h, self.lr = 8192, 64, 0.8e-4 + elif model_size_in_b < 105: + self.hs, self.att_h, self.lr = 10240, 80, 0.7e-4 + elif model_size_in_b < 205: + self.hs, self.att_h, self.lr = 12288, 96, 0.6e-4 + elif model_size_in_b < 405: + self.hs, self.att_h, self.lr = 20480, 128, 0.5e-4 + elif model_size_in_b < 805: + self.hs, self.att_h, self.lr = 20480, 128, 0.4e-4 + elif model_size_in_b < 1105: + self.hs, self.att_h, self.lr = 25600, 160, 0.3e-4 + else: + raise ValueError("Model_size for GPT-3 must be smaller than 1.1T parameters.") + elif model_name == "t5": + self.kv, self.lr = 64, 1e-4 + if model_size_in_b < 0.1: + self.hs, self.att_h, self.ffn = 512, 6, 1024 + elif model_size_in_b < 0.4: + self.hs, self.att_h, self.ffn = 768, 12, 2048 + elif model_size_in_b < 1: + self.hs, self.att_h, self.ffn = 1024, 16, 2816 + elif model_size_in_b < 5: + self.hs, self.att_h, self.ffn = 2048, 32, 5120 + elif model_size_in_b < 15: + self.hs, self.att_h, self.ffn = 4096, 64, 10240 + elif model_size_in_b < 25.9: + self.hs, self.att_h, self.ffn = 5120, 80, 10880 + elif model_size_in_b < 43.0: + self.hs, self.att_h, self.ffn = 6144, 96, 10880 + elif model_size_in_b <= 85.5: + self.hs, self.att_h, self.ffn = 6144, 96, 16384 + elif model_size_in_b <= 165.5: + self.hs, self.att_h, self.ffn, kv = 7680, 96, 20480, 128 + elif model_size_in_b <= 250: + self.hs, self.att_h, self.ffn, kv = 12288, 96, 32768, 128 + else: + raise ValueError("Model_size for T5 must be smaller than 250B parameters.") + elif model_name == "mt5": + self.kv, self.lr = 64, 1e-4 + if model_size_in_b < 0.25: + self.hs, self.att_h, self.ffn = 512, 6, 1024 + elif model_size_in_b < 0.5: + self.hs, self.att_h, self.ffn = 768, 12, 2048 + elif model_size_in_b < 1.2: + self.hs, self.att_h, self.ffn = 1024, 16, 2816 + elif model_size_in_b < 5: + self.hs, self.att_h, self.ffn = 2048, 32, 5120 + elif model_size_in_b < 15: + self.hs, self.att_h, self.ffn = 4096, 64, 10240 + elif model_size_in_b < 25.9: + self.hs, self.att_h, self.ffn = 5120, 80, 10880 + elif model_size_in_b < 43.0: + self.hs, self.att_h, self.ffn = 6144, 96, 10880 + elif model_size_in_b <= 85.5: + self.hs, self.att_h, self.ffn = 6144, 96, 16384 + elif model_size_in_b <= 165.5: + self.hs, self.att_h, self.ffn, kv = 7680, 96, 20480, 128 + elif model_size_in_b <= 250: + self.hs, self.att_h, self.ffn, kv = 12288, 96, 32768, 128 
+ else: + raise ValueError("Model_size for mT5 must be smaller than 250B parameters.") + elif model_name == "bert": + self.lr = 1e-4 + if model_size_in_b < 0.25: + self.hs, self.att_h, self.lr = 768, 12, 2e-4 + elif model_size_in_b < 0.5: + self.hs, self.att_h, self.lr = 1024, 16, 2e-4 + elif model_size_in_b < 1: + self.hs, self.att_h = 1536, 16 + elif model_size_in_b < 2: + self.hs, self.att_h = 2048, 16 + elif model_size_in_b < 3: + self.hs, self.att_h = 2560, 32 + elif model_size_in_b < 4.5: + self.hs, self.att_h = 2560, 32 + elif model_size_in_b < 8: + self.hs, self.att_h = 4096, 32 + elif model_size_in_b < 15: + self.hs, self.att_h = 5120, 40 + elif model_size_in_b <= 25: + self.hs, self.att_h = 6144, 48 + elif model_size_in_b <= 46.5: + self.hs, self.att_h = 7680, 48 + elif model_size_in_b <= 87.5: + self.hs, self.att_h = 9216, 96 + elif model_size_in_b <= 165.5: + self.hs, self.att_h = 9216, 96 + elif model_size_in_b <= 250.5: + self.hs, self.att_h = 12288, 96 + else: + raise ValueError("Model_size for BERT must be smaller than 25B parameters.") + self.ffn = 4 * self.hs + else: + raise NotImplementedError("Model name is not valid.") + + # Try powers of 2 + margin = 0.01 + for attempt in range(0, 10): + for layers in (2**p for p in range(1, 10)): + out_size = _calculate_model_size( + vocab_size=vocab_size, + seq_length=seq_length, + hidden_size=hs, + num_layers=layers, + ffn_size=ffn, + kv_channels=kv, + att_heads=att_h, + model_name=model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 16 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(16, 201, 16): + out_size = _calculate_model_size( + vocab_size=vocab_size, + seq_length=seq_length, + hidden_size=hs, + num_layers=layers, + ffn_size=ffn, + kv_channels=kv, + att_heads=att_h, + model_name=model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 2 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(2, 201, 2): + out_size = _calculate_model_size( + vocab_size=vocab_size, + seq_length=seq_length, + hidden_size=hs, + num_layers=layers, + ffn_size=ffn, + kv_channels=kv, + att_heads=att_h, + model_name=model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + # Try multiples of 5 + margin = 0.01 + for attempt in range(0, 6): + for layers in range(5, 201, 5): + out_size = _calculate_model_size( + vocab_size=vocab_size, + seq_length=seq_length, + hidden_size=hs, + num_layers=layers, + ffn_size=ffn, + kv_channels=kv, + att_heads=att_h, + model_name=model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. 
+ + # Try any valid number + margin = 0.01 + for attempt in range(0, 10): + for layers in range(1, 200): + out_size = _calculate_model_size( + vocab_size=vocab_size, + seq_length=seq_length, + hidden_size=hs, + num_layers=layers, + ffn_size=ffn, + kv_channels=kv, + att_heads=att_h, + model_name=model_name, + ) + if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: + self.layers = layers + margin += 0.01 # Double margin of acceptable model sizes. + + if not self.layers: + raise Exception("Number of layers not found, config is not possible.") + + def _calculate_model_size( vocab_size: int = None, seq_length: int = None, @@ -37,21 +278,25 @@ def _calculate_model_size( att_heads: int = None, model_name: str = "gpt3", ): + """Calculates the model size (number of parameters in billions), given the model parameters and name. + + Args: + vocab_size (int): vocabulary size to be used during training. + seq_length (int): input sequence length to be used during training. + hidden_size (int): size of the hidden layers of the model. + num_layers (int): number of layers in the model. + ffn_size (int): FFN size of the model. + kv_channels (int): number of KV channels in the transformer layers. + att_heads (int): number of attention heads in the transformer layers. + model_name (str): name of the model, i.e gpt3, t5, mt5... + + Returns: + float: size of the model in billions of parameters. + + Raises: + NotImplementedError: if the model name is not valid. """ - Calculates the model size (number of parameters in billions), given the model parameters - and name. - :param int vocab_size: vocabulary size to be used during training. - :param int seq_length: input sequence length to be used during training. - :param int hidden_size: size of the hidden layers of the model. - :param int num_layers: number of layers in the model. - :param int ffn_size: FFN size of the model. - :param int kv_channels: number of KV channels in the transformer layers. - :param int att_heads: number of attention heads in the transformer layers. - :param str model_name: name of the model, i.e gpt3, t5, mt5... - :return: size of the model in billions of parameters. - :rtype: float - :raises NotImplementedError: if the model name is not valid. - """ + if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: model_size = ( 12 @@ -81,238 +326,6 @@ def _calculate_model_size( return model_size -def calculate_model_size_params( - model_size_in_b: float, - vocab_size: int = 51200, - seq_length: int = 2048, - model_name: str = "gpt3", -) -> Tuple[int, int, float]: - """ - Calculates the parameters that affect model_size: hidden size, attention heads, - KV channels, and FFN size. It also calculates the learning rate. - :param float model_size_in_b: float, number of parameters in the desired model config, in billions. - :param int seq_length: int, sequence length to be used during training. - :param int vocab_size: int, size of the vocabulary to use for training. - :param str model_name: str, name of the model to be trained, i.e. gpt3, t5, mt5... - :returns: tuple (layers, hs, att_h, ffn, kv, lr) - WHERE - int layers is the number of layers in the model. - int hs is the hidden size of the model. - int att_h is the number of attention heads in the model. - int ffn is the FFN hidden size of the model. - int kv is the number of KV channels in the model. - float lr is the learning rate used to train the model. 
- :raises ValueError: if the model size is larger than the max supported model size. - :raises NotImplementedError: if the model name is not supported. - """ - ffn, kv = None, None # Only needed for some models. - if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: - if model_size_in_b < 0.25: - hs, att_h, lr = 768, 12, 6e-4 - elif model_size_in_b < 0.5: - hs, att_h, lr = 1024, 16, 3e-4 - elif model_size_in_b < 1: - hs, att_h, lr = 1536, 16, 2.5e-4 - elif model_size_in_b < 2: - hs, att_h, lr = 2048, 16, 2e-4 - elif model_size_in_b < 3: - hs, att_h, lr = 2560, 32, 1.6e-4 - elif model_size_in_b < 4.5: - hs, att_h, lr = 3072, 32, 1.4e-4 - elif model_size_in_b < 8: - hs, att_h, lr = 4096, 32, 1.2e-4 - elif model_size_in_b < 15: - hs, att_h, lr = 5120, 40, 1e-4 - elif model_size_in_b < 25: - hs, att_h, lr = 6144, 48, 1e-4 - elif model_size_in_b < 52: - hs, att_h, lr = 8192, 64, 0.8e-4 - elif model_size_in_b < 105: - hs, att_h, lr = 10240, 80, 0.7e-4 - elif model_size_in_b < 205: - hs, att_h, lr = 12288, 96, 0.6e-4 - elif model_size_in_b < 405: - hs, att_h, lr = 20480, 128, 0.5e-4 - elif model_size_in_b < 805: - hs, att_h, lr = 20480, 128, 0.4e-4 - elif model_size_in_b < 1105: - hs, att_h, lr = 25600, 160, 0.3e-4 - else: - raise ValueError("Model_size for GPT-3 must be smaller than 1.1T parameters.") - elif model_name == "t5": - kv, lr = 64, 1e-4 - if model_size_in_b < 0.1: - hs, att_h, ffn = 512, 6, 1024 - elif model_size_in_b < 0.4: - hs, att_h, ffn = 768, 12, 2048 - elif model_size_in_b < 1: - hs, att_h, ffn = 1024, 16, 2816 - elif model_size_in_b < 5: - hs, att_h, ffn = 2048, 32, 5120 - elif model_size_in_b < 15: - hs, att_h, ffn = 4096, 64, 10240 - elif model_size_in_b < 25.9: - hs, att_h, ffn = 5120, 80, 10880 - elif model_size_in_b < 43.0: - hs, att_h, ffn = 6144, 96, 10880 - elif model_size_in_b <= 85.5: - hs, att_h, ffn = 6144, 96, 16384 - elif model_size_in_b <= 165.5: - hs, att_h, ffn, kv = 7680, 96, 20480, 128 - elif model_size_in_b <= 250: - hs, att_h, ffn, kv = 12288, 96, 32768, 128 - else: - raise ValueError("Model_size for T5 must be smaller than 250B parameters.") - elif model_name == "mt5": - kv, lr = 64, 1e-4 - if model_size_in_b < 0.25: - hs, att_h, ffn = 512, 6, 1024 - elif model_size_in_b < 0.5: - hs, att_h, ffn = 768, 12, 2048 - elif model_size_in_b < 1.2: - hs, att_h, ffn = 1024, 16, 2816 - elif model_size_in_b < 5: - hs, att_h, ffn = 2048, 32, 5120 - elif model_size_in_b < 15: - hs, att_h, ffn = 4096, 64, 10240 - elif model_size_in_b < 25.9: - hs, att_h, ffn = 5120, 80, 10880 - elif model_size_in_b < 43.0: - hs, att_h, ffn = 6144, 96, 10880 - elif model_size_in_b <= 85.5: - hs, att_h, ffn = 6144, 96, 16384 - elif model_size_in_b <= 165.5: - hs, att_h, ffn, kv = 7680, 96, 20480, 128 - elif model_size_in_b <= 250: - hs, att_h, ffn, kv = 12288, 96, 32768, 128 - else: - raise ValueError("Model_size for mT5 must be smaller than 250B parameters.") - elif model_name == "bert": - lr = 1e-4 - if model_size_in_b < 0.25: - hs, att_h, lr = 768, 12, 2e-4 - elif model_size_in_b < 0.5: - hs, att_h, lr = 1024, 16, 2e-4 - elif model_size_in_b < 1: - hs, att_h = 1536, 16 - elif model_size_in_b < 2: - hs, att_h = 2048, 16 - elif model_size_in_b < 3: - hs, att_h = 2560, 32 - elif model_size_in_b < 4.5: - hs, att_h = 2560, 32 - elif model_size_in_b < 8: - hs, att_h = 4096, 32 - elif model_size_in_b < 15: - hs, att_h = 5120, 40 - elif model_size_in_b <= 25: - hs, att_h = 6144, 48 - elif model_size_in_b <= 46.5: - hs, att_h = 7680, 48 - elif 
model_size_in_b <= 87.5: - hs, att_h = 9216, 96 - elif model_size_in_b <= 165.5: - hs, att_h = 9216, 96 - elif model_size_in_b <= 250.5: - hs, att_h = 12288, 96 - else: - raise ValueError("Model_size for BERT must be smaller than 25B parameters.") - ffn = 4 * hs - else: - raise NotImplementedError("Model name is not valid.") - - # Try powers of 2 - margin = 0.01 - for attempt in range(0, 10): - for layers in (2**p for p in range(1, 10)): - out_size = _calculate_model_size( - vocab_size=vocab_size, - seq_length=seq_length, - hidden_size=hs, - num_layers=layers, - ffn_size=ffn, - kv_channels=kv, - att_heads=att_h, - model_name=model_name, - ) - if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin): - return layers, hs, att_h, ffn, kv, lr - margin += 0.01 # Double margin of acceptable model sizes. - - # Try multiples of 16 - margin = 0.01 - for attempt in range(0, 6): - for layers in range(16, 201, 16): - out_size = _calculate_model_size( - vocab_size=vocab_size, - seq_length=seq_length, - hidden_size=hs, - num_layers=layers, - ffn_size=ffn, - kv_channels=kv, - att_heads=att_h, - model_name=model_name, - ) - if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin): - return layers, hs, att_h, ffn, kv, lr - margin += 0.01 # Double margin of acceptable model sizes. - - # Try multiples of 2 - margin = 0.01 - for attempt in range(0, 6): - for layers in range(2, 201, 2): - out_size = _calculate_model_size( - vocab_size=vocab_size, - seq_length=seq_length, - hidden_size=hs, - num_layers=layers, - ffn_size=ffn, - kv_channels=kv, - att_heads=att_h, - model_name=model_name, - ) - if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin): - return layers, hs, att_h, ffn, kv, lr - margin += 0.01 # Double margin of acceptable model sizes. - - # Try multiples of 5 - margin = 0.01 - for attempt in range(0, 6): - for layers in range(5, 201, 5): - out_size = _calculate_model_size( - vocab_size=vocab_size, - seq_length=seq_length, - hidden_size=hs, - num_layers=layers, - ffn_size=ffn, - kv_channels=kv, - att_heads=att_h, - model_name=model_name, - ) - if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin): - return layers, hs, att_h, ffn, kv, lr - margin += 0.01 # Double margin of acceptable model sizes. - - # Try any valid number - margin = 0.01 - for attempt in range(0, 10): - for layers in range(1, 200): - out_size = _calculate_model_size( - vocab_size=vocab_size, - seq_length=seq_length, - hidden_size=hs, - num_layers=layers, - ffn_size=ffn, - kv_channels=kv, - att_heads=att_h, - model_name=model_name, - ) - if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin): - return layers, hs, att_h, ffn, kv, lr - margin += 0.01 # Double margin of acceptable model sizes. - raise Exception("Number of layers not found, config is not possible.") - - def generic_base_config( model_name: str = "llama", model_version: int = 2, @@ -320,13 +333,19 @@ def generic_base_config( model_measure: str = "B", cfg: dict = {}, ) -> dict: + """Generates a base config dictionary from a base config python file. + + Args: + model_name (str): name of the model, i.e. gpt3, t5, mt5... + model_version (int): version of model. + model_size_in_b (int): model size. + model_measure (str): model measure. Billions if "B", millions if "M". + cfg (dict): dict config object for the Auto Configurator tool. + + Returns: + dict: dictionary containing the base configuration for the model. 
""" - Generates a base config dictionary from a base config python file. - :param dict cfg: dict config object for the Auto Configurator tool. - :param str model_name: name of the model, i.e. gpt3, t5, mt5... - :returns: dictionary containing the base configuration for the model. - :rtype: dict - """ + from nemo.collections.llm.tools.auto_configurator.core.base_config import calculate_model_size default_model = False if model_size_in_b else True @@ -359,22 +378,22 @@ def generic_base_config( } if default_model: - num_layers, hidden_size, num_attention_heads, ffn_hidden_size, kv_channels, _ = calculate_model_size_params( + params = ModelSizeParams( model_size_in_b, cfg.get("vocab_size"), cfg.get("seq_length"), model_name, - ) + ).init_params() if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: - base_cfg["model"].num_layers = num_layers - base_cfg["model"].hidden_size = hidden_size - base_cfg["model"].num_attention_heads = num_attention_heads - base_cfg["model"].kv_channels = kv_channels if kv_channels else None - if not ffn_hidden_size: - base_cfg["model"].ffn_hidden_size = hidden_size * 4 + base_cfg["model"].num_layers = params.layers + base_cfg["model"].hidden_size = params.hs + base_cfg["model"].num_attention_heads = params.att_h + base_cfg["model"].kv_channels = params.kv + if not params.ffn: + base_cfg["model"].ffn_hidden_size = params.hs * 4 else: - base_cfg["model"].ffn_hidden_size = ffn_hidden_size + base_cfg["model"].ffn_hidden_size = params.ffn cfg["model_size_in_b"] = model_size_in_b @@ -397,25 +416,28 @@ def modify_cfg( num_nodes: int, model_name: str, ) -> dict: + """Modify the base configuration for the model with the new parameters that are specific to the current model, which the Auto Configurator tool heuristics selected. + + Args: + base_cfg (dict): base configuration for the current model, which will be modified in this function. + act (int): number of activation checkpointing layers to use for the model. + num_mbs_act (int): sets the number of micro-batches where only a partial number of Transformer layers get checkpointed and recomputed within a window of micro-batches. + act_per_pipe (int): sets the number of Transformer layers to skip checkpointing at later pipeline stages. + tp (int): Tensor Parallelism (TP) value to be set for the model. + pp (int): Pipeline Parallelism (PP) value to be set for the model. + cp (int): Context Parallelism (CP) value to be set for the model. + ep (int): Expert Parallelism (EP) value to be set for the model. + virtual_pipelines (int): Virtual Pipelines value to be set for the model. + mbs (int): Micro Batch Size (MBS) value to be set for the model. + max_minutes (int): maximum amount of time to run this model for. + max_steps (int): maximum number of steps to run this model for. + num_nodes (int): number of nodes to use for the training run. + model_name (str): name of the model, i.e. gpt3, t5, mt5... + + Returns: + dict: dictionary containing the updated model configuration parameters. """ - Modify the base configuration for the model with the new parameters that are specific to the current model, which the Auto Configurator tool heuristics selected. - :param dict base_cfg: base configuration for the current model, which will be modified in this function. - :param int act: number of activation checkpointing layers to use for the model. 
- :param int num_mbs_act: sets the number of micro-batches where only a partial number of Transformer layers get checkpointed and recomputed within a window of micro-batches. - :param int act_per_pipe: sets the number of Transformer layers to skip checkpointing at later pipeline stages. - :param int tp: Tensor Parallelism (TP) value to be set for the model. - :param int pp: Pipeline Parallelism (PP) value to be set for the model. - :param int cp: Context Parallelism (CP) value to be set for the model. - :param int ep: Expert Parallelism (EP) value to be set for the model. - :param int virtual_pipelines: Virtual Pipelines value to be set for the model. - :param int mbs: Micro Batch Size (MBS) value to be set for the model. - :param int max_minutes: maximum amount of time to run this model for. - :param int max_steps: maximum number of steps to run this model for. - :param int num_nodes: number of nodes to use for the training run. - :param str model_name: name of the model, i.e. gpt3, t5, mt5... - :return: dictionary containing the updated model configuration parameters. - :rtype: dict - """ + new_cfg = copy.deepcopy(base_cfg) if act is not None: if model_name in [ diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 072f23d0d285..0c093b65f2ae 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -78,35 +78,36 @@ def __init__( nemo_run: Optional[bool] = False, ): """ - :param str model_type: model type to be used for training. - :param int num_nodes: number of nodes to be used for training. - :param List data_paths: list of datafiles to be used for training. - :param str path_to_logs: path to the directory where the logs will be stored. - :param Optional[str] tokenizer_type: tokenizer type. - :param Optional[str] tokenizer_path: path to the tokenizer model. - :param Optional[int] model_size: size of model to be trained. - :param Optional[int] model_version: version of model. 3 for GPT3, 2 for Llama2. - :param Optional[int] gpus_per_node: number of GPUs per node to be used. - :param Optional[int] gpu_memory_gb: memory per GPU, in GB. Currently 40GB and 80GB A100s/H100s supported. - :param Optional[str] model_measure: "M" if model_size is specified in millions. "B" if in billions. - :param Optional[int] seq_length: model sequence length. Available seq_length list for GPT-based models: [2048, 4096, 8192, 16384, 32768]. - :param Optional[int] global_batch_size: model global batch size. Set to "auto" if you want auto configurator to find optimal gbs. - :param Optional[List[int]] tensor_parallel_sizes: set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. - :param Optional[List[int]] pipeline_parallel_sizes: set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. - :param Optional[List[int]] micro_batch_sizes: set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. - :param Optional[List[int]] context_parallel_sizes: model context parallel size. A list, such as [1, 2, 4, 8]. - :param Optional[List[int]] expert_parallel_sizes: model expert parallel size. A list, such as [1, 2, 4, 8]. - :param Optional[int] min_model_parallel_size: set to "auto" to use our recommendation, or a value for the minimum desired parallelism. - :param Optional[int] max_model_parallel_size: set to "auto" to use our recommendation, or a value for the maximum desired parallelism. 
-        :param Optional[int] num_tokens_in_b: number of tokens in billions in train dataset.
-        :param Optional[int] tflops_per_gpu: estimated tflops per GPU.
-        :param Optional[int] max_minutes_per_run: maximum number of minutes per run for the grid search.
-        :param Optional[int] max_training_days: number of days expected model to be trained.
-        :param Optional[int] max_steps_per_run: maximum number of steps per run for the grid search.
-        :param Optional[int] vocab_size: size of tokenizer vocabulary.
-        :param Optional[dict] model_args: additional args to add to mdoel config.
-        :param Optional[bool] custom_model: set to True if you want to use custom model.
-        :param Optional[bool] nemo_sdk: set to True if you want to run Auto Configurator with nemo-sdk.
+        Args:
+            model_type (str): model type to be used for training.
+            num_nodes (int): number of nodes to be used for training.
+            data_paths (List): list of datafiles to be used for training.
+            path_to_logs (str): path to the directory where the logs will be stored.
+            tokenizer_type (Optional[str]): tokenizer type.
+            tokenizer_path (Optional[str]): path to the tokenizer model.
+            model_size (Optional[int]): size of model to be trained.
+            model_version (Optional[int]): version of model. 3 for GPT3, 2 for Llama2.
+            gpus_per_node (Optional[int]): number of GPUs per node to be used.
+            gpu_memory_gb (Optional[int]): memory per GPU, in GB. Currently 40GB and 80GB A100s/H100s supported.
+            model_measure (Optional[str]): "M" if model_size is specified in millions. "B" if in billions.
+            seq_length (Optional[int]): model sequence length. Available seq_length list for GPT-based models: [2048, 4096, 8192, 16384, 32768].
+            global_batch_size (Optional[int]): model global batch size. Set to "auto" if you want auto configurator to find optimal gbs.
+            tensor_parallel_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8].
+            pipeline_parallel_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8].
+            micro_batch_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8].
+            context_parallel_sizes (Optional[List[int]]): model context parallel size. A list, such as [1, 2, 4, 8].
+            expert_parallel_sizes (Optional[List[int]]): model expert parallel size. A list, such as [1, 2, 4, 8].
+            min_model_parallel_size (Optional[int]): set to "auto" to use our recommendation, or a value for the minimum desired parallelism.
+            max_model_parallel_size (Optional[int]): set to "auto" to use our recommendation, or a value for the maximum desired parallelism.
+            num_tokens_in_b (Optional[int]): number of tokens in billions in train dataset.
+            tflops_per_gpu (Optional[int]): estimated tflops per GPU.
+            max_minutes_per_run (Optional[int]): maximum number of minutes per run for the grid search.
+            max_training_days (Optional[int]): number of days expected model to be trained.
+            max_steps_per_run (Optional[int]): maximum number of steps per run for the grid search.
+            vocab_size (Optional[int]): size of tokenizer vocabulary.
+            model_args (Optional[dict]): additional args to add to model config.
+            custom_model (Optional[bool]): set to True if you want to use a custom model.
+            nemo_run (Optional[bool]): set to True if you want to run Auto Configurator with NeMo-Run.
         """
 
         assert model_type in SUPPORTED_MODELS, f"model_type must be set to one of {SUPPORTED_MODELS}."
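To make the Args block above concrete, here is a minimal usage sketch. It assumes the runner class is exported as AutoConfigurator from nemo.collections.llm.tools.auto_configurator, and the paths, tokenizer settings, and sizes below are placeholders chosen for illustration, not values taken from this patch:

    from nemo.collections.llm.tools.auto_configurator import AutoConfigurator

    # Illustrative values only; paths are placeholders.
    runner = AutoConfigurator(
        model_type="llama",                     # must be one of SUPPORTED_MODELS
        model_version=2,
        model_size=7,                           # interpreted in billions with model_measure="B"
        model_measure="B",
        num_nodes=4,
        gpus_per_node=8,
        seq_length=2048,
        global_batch_size=2048,
        data_paths=["/data/tokenized/prefix"],  # placeholder
        tokenizer_path="/tokenizers/llama2",    # placeholder
        path_to_logs="/results/autoconf_logs",  # placeholder
        tensor_parallel_sizes="auto",
        pipeline_parallel_sizes="auto",
        micro_batch_sizes=[1, 2, 4],
        max_minutes_per_run=20,
        max_steps_per_run=50,
        num_tokens_in_b=300,
        vocab_size=32000,
    )

Which of these arguments are required versus optional follows the __init__ signature shown in the hunk above.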
@@ -150,10 +151,10 @@ def _generate_nemo_run_configs(
     ) -> dict:
         """
         Function that returns a dictionary of Partial configs.
-        :param: dict config: runner config.
-        :param: str tokenizer_type: tokenizer type.
-        :param: str tokenizer_path: path to the tokenizer.
-        :param: str path_to_logs: path to logs directory.
+        config (dict): runner config.
+        tokenizer_type (str): tokenizer type.
+        tokenizer_path (str): path to the tokenizer.
+        path_to_logs (str): path to logs directory.
         :return: dictionary of Partial configs.
         :rtype: dict.
         """
@@ -179,8 +180,8 @@ def _get_model(self, model_config, tokenizer):
     def _get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config:
         """
         Function that returns the tokenizer config.
-        :param: str tokenizer_type: tokenizer type.
-        :param: str tokenizer_path: path to the tokenizer.
+        tokenizer_type (str): tokenizer type.
+        tokenizer_path (str): path to the tokenizer.
         :return: tokenizer config.
         :rtype: Config.
         """
@@ -193,7 +194,7 @@ def _get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config:
     def _get_data(self, data_config: dict, tokenizer_config: Config) -> Config:
         """
         Function that returns the data module.
-        :param: Config tokenizer: tokenizer config.
+        tokenizer (Config): tokenizer config.
         :return: data module.
         :rtype: Config.
         """
@@ -207,7 +208,7 @@ def _get_data(self, data_config: dict, tokenizer_config: Config) -> Config:
     def _get_optim(self, optim_config: Config) -> Config:
         """
         Function that returns the optimizer.
-        :param: Config optim_config: optimizer config.
+        optim_config (Config): optimizer config.
         :return: optimizer.
         :rtype: Config.
         """
@@ -228,8 +229,8 @@ def _get_optim(self, optim_config: Config) -> Config:
     def _get_trainer(self, trainer_config: dict, strategy: Config) -> Config:
         """
         Function that returns the trainer.
-        :param: dict trainer_config: trainer config.
-        :param: Config strategy: training strategy.
+        trainer_config (dict): trainer config.
+        strategy (Config): training strategy.
         :return: trainer.
         :rtype: Config.
         """
@@ -245,7 +246,7 @@ def _get_trainer(self, trainer_config: dict, strategy: Config) -> Config:
     def _get_startegy(self, auto_config: dict) -> Config:
         """
         Function that returns the training strategy.
-        :param: dict auto_config: model parallelism config.
+        auto_config (dict): model parallelism config.
         :return: training strategy.
         :rtype: Config.
         """
@@ -263,8 +264,8 @@ def _get_startegy(self, auto_config: dict) -> Config:
     def _get_logger(self, run_name: str, path_to_logs: str) -> Config:
         """
         Function that returns the training strategy.
-        :param: str run_name: name of run.
-        :param: str path_to_logs: path to logs directory.
+        run_name (str): name of run.
+        path_to_logs (str): path to logs directory.
         :return: training logger.
         :rtype: Config.
         """
@@ -291,7 +292,7 @@ def _get_logger(self, run_name: str, path_to_logs: str) -> Config:
     def _get_message(self, config: dict) -> str:
         """
         Function that returns runner config line by line.
-        :param: dict config: runner config.
+        config (dict): runner config.
         :return: runner config params.
         :rtype: str.
""" From bda010095f34a27b60ceb0666adcc0ae5bd0f255 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 28 Aug 2024 12:18:49 +0000 Subject: [PATCH 19/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../tools/auto_configurator/core/base_config.py | 16 ++++++++-------- .../core/calculate_performance.py | 16 ++++++++-------- .../auto_configurator/core/training_config.py | 4 ++-- .../llm/tools/auto_configurator/core/utils.py | 12 ++++++------ 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index 5d172a6df29b..c47bc6b894a7 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -30,17 +30,17 @@ def calculate_model_size( """Estimates a model size to be trained given the constraints. If the model_size is provided, it estimates the time to train it with the given constraints. - + Example: output 5B params to train for 7 days with 160 GPUs. - + Args: gpu_count (int): number of gpus to use (num_nodes * gpus_per_node). max_training_days (float): number of days to train the model for. model_size_in_b (float): number of parameters in the model, if known. tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. num_tokens_in_b (int): number of tokens to train the model for. - + Returns: float: number of parameters to use for training. """ @@ -88,10 +88,10 @@ def _estimate_model_size( tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. num_tokens_in_b (int): number of tokens to train the model for. model_name (str): name of the model, such as gpt3, t5, mt5... - + Returns: float: number of parameters to use for training. - + Raises: NotImplementedError: if the model_name is not one of the supported models. """ @@ -136,11 +136,11 @@ def _estimate_training_time( Returns: float: number of days it will take to train the model. - + Raises: NotImplementedError: if the model_name is not one of the supported models. """ - + model_penalty = 1.15 if model_name == "mt5" else 1.0 valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma"] try: @@ -159,4 +159,4 @@ def _estimate_training_time( print(f"Cannot divide by zero. This can happen if gpu_count or tflops_per_gpu are zero: {err}") except NotImplementedError as err: print(f"Training time estimation is only available for {valid_models}: {err}") - return None \ No newline at end of file + return None diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py index 32f392a2f8da..a3aeabe3697e 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -61,7 +61,7 @@ def get_results( custom_model (Optional[bool]): set to True if custom model was used. output_top_n (Optional[int]): Number of configs to be printed out as best configs. """ - + # Get model architecture cfg = locals() cfg["gpu_count"] = num_nodes * gpus_per_node @@ -254,12 +254,12 @@ def calculate_tflops( ): """Calculates model and hardware TFLOPS for each model. 
- GPT-3 Formulas: - Model FLOPs = (24𝐵𝑠ℎ^2 + 4𝐵��^2ℎ) x (3 x num_layers) + 6𝐵𝑠ℎ - T5/mT5 Formula: - Model FLOPs = - Bert Formula: - Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL)) + GPT-3 Formulas: + Model FLOPs = (24𝐵𝑠ℎ^2 + 4𝐵��^2ℎ) x (3 x num_layers) + 6𝐵𝑠ℎ + T5/mT5 Formula: + Model FLOPs = + Bert Formula: + Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL)) """ if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral"]: @@ -320,7 +320,7 @@ def find_error(error_file: str, errors: list = ["CUDA out of memory"]): Args: :param list errors: list of "popular" errors. :param str error_file: path to the job output. - + Returns: str: serror message if job has been failed because of one of listed errors or None if not. """ diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 376d96c1012a..cc4f9f6d9f7b 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -866,7 +866,7 @@ def _calculate_tp_pp_mbs_grid( model_name (str): name of the model to be used, such as gpt3, t5, mt5... seq_length (int): sequence length to use for training. train_cfg (dict): config of the model that will be launched. - + Returns: dataclass object with model parallelism parameters. @@ -933,4 +933,4 @@ def _calculate_tp_pp_mbs_grid( params.min_model_parallel = min_model_parallel_size if max_model_parallel_size is not None and max_model_parallel_size != "auto": params.max_model_parallel = max_model_parallel_size - return params \ No newline at end of file + return params diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index b7f7c0f1d8e0..5dddb64cbcbb 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -36,7 +36,7 @@ class ModelSizeParams: seq_length (int): sequence length to be used during training. vocab_size (int): size of the vocabulary to use for training. model_name (str): name of the model to be trained, i.e. gpt3, t5, mt5... - + Raises: ValueError: if the model size is larger than the max supported model size. NotImplementedError: if the model name is not supported. @@ -263,7 +263,7 @@ def init_params(self): if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: self.layers = layers margin += 0.01 # Double margin of acceptable model sizes. - + if not self.layers: raise Exception("Number of layers not found, config is not possible.") @@ -289,10 +289,10 @@ def _calculate_model_size( kv_channels (int): number of KV channels in the transformer layers. att_heads (int): number of attention heads in the transformer layers. model_name (str): name of the model, i.e gpt3, t5, mt5... - + Returns: float: size of the model in billions of parameters. - + Raises: NotImplementedError: if the model name is not valid. """ @@ -341,7 +341,7 @@ def generic_base_config( model_size_in_b (int): model size. model_measure (str): model measure. Billions if "B", millions if "M". cfg (dict): dict config object for the Auto Configurator tool. - + Returns: dict: dictionary containing the base configuration for the model. """ @@ -437,7 +437,7 @@ def modify_cfg( Returns: dict: dictionary containing the updated model configuration parameters. 
""" - + new_cfg = copy.deepcopy(base_cfg) if act is not None: if model_name in [ From 6d5305ec0d39af0b1523fd6466f3bc49b3927abe Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 28 Aug 2024 08:39:28 -0700 Subject: [PATCH 20/63] take Config object as input for model Signed-off-by: dimapihtar --- .../auto_configurator/base_configs/basic.py | 29 +++------- .../auto_configurator/base_configs/gemma.py | 29 +++------- .../auto_configurator/base_configs/gpt.py | 25 ++------ .../auto_configurator/base_configs/llama.py | 30 +++------- .../auto_configurator/base_configs/mistral.py | 28 +++------ .../auto_configurator/base_configs/mixtral.py | 28 +++------ .../llm/tools/auto_configurator/core/utils.py | 4 +- .../llm/tools/auto_configurator/runner.py | 57 ++++++++----------- 8 files changed, 66 insertions(+), 164 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py index 4044c1f8bd10..33a210208ca6 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py @@ -20,10 +20,7 @@ class Basic: def __init__( self, - name: str = None, - version: int = None, - size: int = None, - measure: str = "B", + model: Config = None, cfg: dict = {}, ): """ @@ -35,19 +32,13 @@ def __init__( cfg (dict): auto configurator runner config. """ - self.name = name - self.version = version - self.size = size - self.measure = measure - self.cfg = cfg + self.model = model self.num_nodes = cfg.get("num_nodes") self.num_gpus = cfg.get("num_gpus") self.max_steps = cfg.get("max_steps_per_run") self.seq_length = cfg.get("seq_length") self.global_batch_size = cfg.get("global_batch_size") - self.tokenizer_path = cfg.get("tokenizer_path") self.data_paths = cfg.get("data_paths") - self.nemo_run = cfg.get("nemo_run") self.max_minutes_per_run = cfg.get("max_minutes_per_run") def model_config(self): @@ -73,15 +64,10 @@ def get_optim_config(self) -> OptimizerConfig: "overlap_param_gather": True, } - if self.nemo_run: - optim_config = Config( - OptimizerConfig, - **optim_params, - ) - else: - optim_config = OptimizerConfig( - **optim_params, - ) + optim_config = Config( + OptimizerConfig, + **optim_params, + ) return optim_config @@ -122,7 +108,6 @@ def get_data_config(self) -> dict: "seq_length": self.seq_length, "global_batch_size": self.global_batch_size, "num_workers": 2, - # "split": "99990,8,2", "index_mapping_dir": None, } @@ -136,7 +121,7 @@ def get_run_config(self) -> dict: """ run_config = { - "name": f"{self.name}_{self.size}{self.measure}", + "name": self.model.__class__.__name__, "results_dir": None, "time_limit": f"0-00:{self.max_minutes_per_run}:00", } diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py b/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py index dfe774441161..29ef5d0dcad9 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py @@ -23,23 +23,16 @@ class Gemma(Basic): def __init__( self, - name: str = "Gemma", - version: int = None, - size: int = 2, - measure: str = "B", + model: Config = None, cfg: dict = {}, ): """ Args: - name (str): model name. - version (int): model version. - size (int): model size. - measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + model (Config): model config. cfg (dict): auto configurator runner config. 
""" - super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) - self.config_name = f"{self.name}Config{self.size}{self.measure}" + super().__init__(model=model, cfg=cfg) def get_model_config(self) -> Config: """Function that returns model config. @@ -48,16 +41,8 @@ def get_model_config(self) -> Config: Config: model config. """ - model_class = getattr(llm, self.config_name) - kwargs = self.cfg.get("model_args", {}) + self.model.global_batch_size = self.global_batch_size + self.model.seq_length = self.seq_length + self.model.pipeline_dtype = torch.bfloat16 - if self.nemo_run: - model_config = Config(model_class, **kwargs) - else: - model_config = model_class(**kwargs) - - model_config.global_batch_size = self.global_batch_size - model_config.seq_length = self.seq_length - model_config.pipeline_dtype = torch.bfloat16 - - return model_config + return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py b/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py index aa8f184abf01..8942486f082c 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py @@ -21,23 +21,16 @@ class GPT(Basic): def __init__( self, - name: str = "GPT", - version: int = 3, - size: int = 5, - measure: str = "B", + model: Config = None, cfg: dict = {}, ): """ Args: name (str): model name. - version (int): model version. - size (int): model size. - measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. cfg (dict): auto configurator runner config. """ - super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) - self.config_name = f"{self.name}Config{self.size}{self.measure}" + super().__init__(model=model, cfg=cfg) def get_model_config(self) -> Config: """Function that returns model config. @@ -46,15 +39,7 @@ def get_model_config(self) -> Config: Config: model config. """ - model_class = getattr(llm, self.config_name) - kwargs = self.cfg.get("model_args", {}) + self.model.global_batch_size = self.global_batch_size + self.model.seq_length = self.seq_length - if self.nemo_run: - model_config = Config(model_class, **kwargs) - else: - model_config = model_class(**kwargs) - - model_config.global_batch_size = self.global_batch_size - model_config.seq_length = self.seq_length - - return model_config + return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py b/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py index 610e89480798..9170a641ddc9 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py @@ -23,23 +23,16 @@ class Llama(Basic): def __init__( self, - name: str = "Llama", - version: int = 2, - size: int = 7, - measure: str = "B", + model: Config = None, cfg: dict = {}, ): """ Args: - name (str): model name. - version (int): model version. - size (int): model size. - measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + model (Config): model config. cfg (dict): auto configurator runner config. """ - super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) - self.config_name = f"{self.name}{self.version}Config{self.size}{self.measure}" + super().__init__(model=model, cfg=cfg) def get_model_config(self) -> Config: """Function that returns model config. 
@@ -48,17 +41,8 @@ def get_model_config(self) -> Config: Config: model config. """ - model_class = getattr(llm, self.config_name) - kwargs = self.cfg.get("model_args", {}) + self.model.global_batch_size = self.global_batch_size + self.model.seq_length = self.seq_length + self.model.pipeline_dtype = torch.bfloat16 - if self.nemo_run: - model_config = Config(model_class, **kwargs) - else: - model_config = model_class(**kwargs) - - model_config.global_batch_size = self.global_batch_size - print(self.global_batch_size) - model_config.seq_length = self.seq_length - model_config.pipeline_dtype = torch.bfloat16 - - return model_config + return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py b/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py index c35c7ecab4b4..7abcabfbf4ab 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py @@ -21,23 +21,16 @@ class Mistral(Basic): def __init__( self, - name: str = "Mistral", - version: int = None, - size: int = 7, - measure: str = "B", + model: Config = None, cfg: dict = {}, ): """ Args: - name (str): model name. - version (int): model version. - size (int): model size. - measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + model (Config): model config. cfg (dict): auto configurator runner config. """ - super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) - self.config_name = f"{self.name}Config{self.size}{self.measure}" + super().__init__(model=model, cfg=cfg) def get_model_config(self) -> Config: """Function that returns model config. @@ -46,15 +39,8 @@ def get_model_config(self) -> Config: Config: model config. """ - model_class = getattr(llm, self.config_name) - kwargs = self.cfg.get("model_args", {}) + self.model.global_batch_size = self.global_batch_size + self.model.seq_length = self.seq_length + self.model.pipeline_dtype = torch.bfloat16 - if self.nemo_run: - model_config = Config(model_class, **kwargs) - else: - model_config = model_class(**kwargs) - - model_config.global_batch_size = self.global_batch_size - model_config.seq_length = self.seq_length - - return model_config + return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py b/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py index 6ad0adb38e38..0ea57c62b0c4 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py @@ -21,23 +21,16 @@ class Mixtral(Basic): def __init__( self, - name: str = "Mixtral", - version: int = 8, - size: int = 7, - measure: str = "B", + model: Config = None, cfg: dict = {}, ): """ Args: - name (str): model name. - version (int): model version. - size (int): model size. - measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + model (Config): model config. cfg (dict): auto configurator runner config. """ - super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg) - self.config_name = f"{self.name}Config{self.version}x{self.size}{self.measure}" + super().__init__(model=model, cfg=cfg) def get_model_config(self) -> Config: """Function that returns model config. @@ -46,15 +39,8 @@ def get_model_config(self) -> Config: Config: model config. 
""" - model_class = getattr(llm, self.config_name) - kwargs = self.cfg.get("model_args", {}) + self.model.global_batch_size = self.global_batch_size + self.model.seq_length = self.seq_length + self.model.pipeline_dtype = torch.bfloat16 - if self.nemo_run: - model_config = Config(model_class, **kwargs) - else: - model_config = model_class(**kwargs) - - model_config.global_batch_size = self.global_batch_size - model_config.seq_length = self.seq_length - - return model_config + return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index 5dddb64cbcbb..78fe2640b601 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -367,7 +367,7 @@ def generic_base_config( elif custom_model: model = base_configs.custom(name=MODULES[model_name], cfg=cfg) else: - model = model_cls(version=model_version, size=model_size_in_b, measure=model_measure, cfg=cfg) + model = model_cls(model=cfg.get("model"), cfg=cfg) base_cfg = { "model": model.get_model_config(), @@ -548,4 +548,4 @@ def modify_cfg( f"Valid config: SeqLen={seq_len}, GBS={gbs}, MBS={mbs}, TP={tp}, PP={pp}, CP={cp}, EP={ep}, act_ckpt_layers={act}, num_mbs_act={num_mbs_act}, act_per_pipe={act_per_pipe}. Adding to directory." ) return new_cfg - return None + return None \ No newline at end of file diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 0c093b65f2ae..100118afd8b0 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -47,7 +47,7 @@ class AutoConfigurator: def __init__( self, - model_type: str = None, + model: Config = None, num_nodes: int = None, data_paths: List = None, path_to_logs: Optional[str] = None, @@ -73,13 +73,11 @@ def __init__( max_training_days: Optional[int] = 2, max_steps_per_run: Optional[int] = 50, vocab_size: Optional[int] = 51200, - model_args: Optional[dict] = {}, custom_model: Optional[bool] = False, - nemo_run: Optional[bool] = False, ): """ Args: - model_type (str): model type to be used for training. + model_type (Config): model type to be used for training. num_nodes (int): number of nodes to be used for training. data_paths (List): list of datafiles to be used for training. path_to_logs (str): path to the directory where the logs will be stored. @@ -105,50 +103,24 @@ def __init__( max_training_days (Optional[int]): number of days expected model to be trained. max_steps_per_run (Optional[int]): maximum number of steps per run for the grid search. vocab_size (Optional[int]): size of tokenizer vocabulary. - model_args (Optional[dict]): additional args to add to mdoel config. custom_model (Optional[bool]): set to True if you want to use custom model. - nemo_sdk (Optional[bool]): set to True if you want to run Auto Configurator with nemo-sdk. """ + model_type = self._get_model_type(model.__class__.__name__) assert model_type in SUPPORTED_MODELS, f"model_type must be set to one of {SUPPORTED_MODELS}." assert tokenizer_type in SUPPORTED_TOKENIZERS, f"tokenizer_type must be set to one of {SUPPORTED_TOKENIZERS}." assert num_nodes, "num_nodes value must be specified." assert data_paths, "training data must be specified." - if nemo_run: - assert path_to_logs, f"path_to_logs parameter must be specified." + assert path_to_logs, f"path_to_logs parameter must be specified." 
self.config = locals() self.config.pop('self') + self.config["model_type"] = model_type # Print the config logging.info(self._get_message(self.config)) def generate_configs(self) -> dict: - """ - :return: dictionary of generated configs. - key: model config name, type: str. - value: model config values, type: dict. - :rtype: dict. - """ - - configs = search_configs(self.config) - if self.config["nemo_run"]: - configs = self._generate_nemo_run_configs( - configs, - self.config["tokenizer_type"], - self.config["tokenizer_path"], - self.config["path_to_logs"], - ) - - return configs - - def _generate_nemo_run_configs( - self, - configs: dict, - tokenizer_type: str, - tokenizer_path: str, - path_to_logs: str, - ) -> dict: """ Function that returns a dictionary of Partial configs. : dict config: runner config. @@ -159,6 +131,11 @@ def _generate_nemo_run_configs( :rtype: dict. """ + configs = search_configs(self.config) + tokenizer_type = self.config.get("tokenizer_type") + tokenizer_path = self.config.get("tokenizer_path") + path_to_logs = self.config.get("path_to_logs") + tokenizer = self._get_tokenizer(tokenizer_type, tokenizer_path) for name, config in configs.items(): strategy = self._get_startegy(config['auto_config']) @@ -302,3 +279,17 @@ def _get_message(self, config: dict) -> str: message += f"{key}: {value}\n" return message + + def _get_model_type(self, model: str) -> str: + if "GPT" in model: + return "gpt3" + elif "Llama" in model: + return "Llama" + elif "Mixtral" in model: + return "mixtral" + elif "Mistral" in model: + return "mistral" + elif "Gemma" in model: + return "gemma" + else: + return None \ No newline at end of file From bb86c39a4f2be520911f743a3c3cea6cdf4229da Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 28 Aug 2024 15:40:48 +0000 Subject: [PATCH 21/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- nemo/collections/llm/tools/auto_configurator/core/utils.py | 2 +- nemo/collections/llm/tools/auto_configurator/runner.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index 78fe2640b601..af331487007a 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -548,4 +548,4 @@ def modify_cfg( f"Valid config: SeqLen={seq_len}, GBS={gbs}, MBS={mbs}, TP={tp}, PP={pp}, CP={cp}, EP={ep}, act_ckpt_layers={act}, num_mbs_act={num_mbs_act}, act_per_pipe={act_per_pipe}. Adding to directory." 
) return new_cfg - return None \ No newline at end of file + return None diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 100118afd8b0..05312e872d4b 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -292,4 +292,4 @@ def _get_model_type(self, model: str) -> str: elif "Gemma" in model: return "gemma" else: - return None \ No newline at end of file + return None From a2099afd334c28294d27cd3ff9e3d86e68746cf5 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 28 Aug 2024 10:17:55 -0700 Subject: [PATCH 22/63] add nemotron support Signed-off-by: dimapihtar --- .../base_configs/__init__.py | 2 +- .../auto_configurator/base_configs/custom.py | 45 ---------- .../base_configs/nemotron.py | 48 +++++++++++ .../auto_configurator/core/base_config.py | 4 +- .../auto_configurator/core/search_config.py | 1 + .../auto_configurator/core/training_config.py | 62 ++++++-------- .../llm/tools/auto_configurator/core/utils.py | 82 ++++++------------- .../llm/tools/auto_configurator/runner.py | 8 +- 8 files changed, 101 insertions(+), 151 deletions(-) delete mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/custom.py create mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/nemotron.py diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py index 9aca9661c6a8..07f38425b69e 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo.collections.llm.tools.auto_configurator.base_configs.custom import custom +from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py b/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py deleted file mode 100644 index 9bcb6ef45777..000000000000 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/custom.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections.llm.tools.auto_configurator import base_configs - -from .basic import Basic - - -def custom(name, cfg): - """Function that return custom model class. - - Args: - name (srt): model type. - cfg (dict): auto configurator runner config. - - Returns - Custom: class object. 
- """ - - basic_class = getattr(base_configs, name) - - class Custom(basic_class): - def __init__(self, name, cfg): - """ - Args: - name (srt): model type. - cfg (dict): auto configurator runner config. - """ - - super().__init__(name=name, cfg=cfg) - - custom_class = Custom(name, cfg) - - return custom_class diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/nemotron.py b/nemo/collections/llm/tools/auto_configurator/base_configs/nemotron.py new file mode 100644 index 000000000000..766503f09b8c --- /dev/null +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/nemotron.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from nemo.collections import llm +from nemo.collections.llm.utils import Config + +from .basic import Basic + + +class NeMotron(Basic): + def __init__( + self, + model: Config = None, + cfg: dict = {}, + ): + """ + Args: + model (Config): model config. + cfg (dict): auto configurator runner config. + """ + + super().__init__(model=model, cfg=cfg) + + def get_model_config(self) -> Config: + """Function that returns model config. + + Returns: + Config: model config. + """ + + self.model.global_batch_size = self.global_batch_size + self.model.seq_length = self.seq_length + self.model.pipeline_dtype = torch.bfloat16 + + return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index c47bc6b894a7..a0c98f07caa7 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -97,7 +97,7 @@ def _estimate_model_size( """ model_penalty = 0.87 if model_name == "mt5" else 1.0 - valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma"] + valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma", "nemotron"] try: if model_name in valid_models: return round( @@ -142,7 +142,7 @@ def _estimate_training_time( """ model_penalty = 1.15 if model_name == "mt5" else 1.0 - valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma"] + valid_models = ["gpt3", "t5", "mt5", "bert", "llama", "mixtral", "mistral", "gemma", "nemotron"] try: if model_name in valid_models: return round( diff --git a/nemo/collections/llm/tools/auto_configurator/core/search_config.py b/nemo/collections/llm/tools/auto_configurator/core/search_config.py index 0fd2492b89d8..62db570d4122 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/search_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/search_config.py @@ -24,6 +24,7 @@ "mixtral", "mistral", "gemma", + "nemotron", ] diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index cc4f9f6d9f7b..63477028d267 100644 --- 
a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -21,6 +21,20 @@ from nemo.collections.llm.tools.auto_configurator.core import utils +GPT_BASED_MODELS = [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + "nemotron", +] + + def generate_grid_search_configs( base_cfg: dict, train_cfg: dict, @@ -43,28 +57,18 @@ def generate_grid_search_configs( # 2 * num_layers is needed because of encoder/decoder architecture. multiplier = ( 1 - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"] + if model_name in GPT_BASED_MODELS else 2 ) seq_length = base_cfg["model"].seq_length num_layers = ( base_cfg["model"].num_layers - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"] + if model_name in GPT_BASED_MODELS else base_cfg["model"].encoder.num_layers ) - if model_name in [ - "gpt3", - "bert", - "llama", - "baichuan2", - "chatglm", - "qwen2", - "mixtral", - "mistral", - "gemma", - ]: + if model_name in GPT_BASED_MODELS: act_method = base_cfg["model"].activations_checkpoint_method else: act_method = base_cfg["model"].encoder.activations_checkpoint_method @@ -89,17 +93,7 @@ def generate_grid_search_configs( for mbs in params.mbs: num_gpus = base_cfg["trainer"]["num_nodes"] * base_cfg["trainer"]["devices"] base_cfg["model"].global_batch_size = params.gbs - if model_name in [ - "gpt3", - "bert", - "llama", - "baichuan2", - "chatglm", - "qwen2", - "mixtral", - "mistral", - "gemma", - ]: + if model_name in GPT_BASED_MODELS: att_heads = base_cfg["model"].num_attention_heads num_layers = base_cfg["model"].num_layers else: @@ -205,7 +199,7 @@ def _set_activations_checkpoint_params( max_layers_per_pipe = num_layers interval_layers_per_pipe = act_multiple if ( - model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"] + model_name in GPT_BASED_MODELS and pp > 2 ): # Interleaved pipeline scheduling. virtual_pipelines = num_layers // pp # TODO: verify that this is the best value. 
@@ -226,17 +220,7 @@ def _set_activations_checkpoint_params( else: act_ckpt_layers = range(0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple) - if pp > 1 and model_name in [ - "gpt3", - "bert", - "llama", - "baichuan2", - "chatglm", - "qwen2", - "mixtral", - "mistral", - "gemma", - ]: + if pp > 1 and model_name in GPT_BASED_MODELS: # Num micro batches with partial act ckpt num_micro_batches_partial_act_ckpt = list(range(min_micro_b, max_micro_b + 1, interval_micro_b)) if num_micro_batches_partial_act_ckpt[0] == 0: @@ -886,11 +870,11 @@ def _calculate_tp_pp_mbs_grid( model_measure = train_cfg.get("model_measure") multiplier = ( 1 - if model_name in ["gpt3", "bert", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"] + if model_name in GPT_BASED_MODELS else 2 ) init_pp = ( - [] if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"] else [1] + [] if model_name in GPT_BASED_MODELS else [1] ) valid_pp = init_pp + [ multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 @@ -904,7 +888,7 @@ def _calculate_tp_pp_mbs_grid( "gpu_memory_gb": gpu_memory_gb, } - if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: + if model_name in GPT_BASED_MODELS: search_class = GPT3GridSearch elif model_name in ["t5", "mt5"]: search_class = T5GridSearch diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index af331487007a..67579e821599 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -24,8 +24,22 @@ "mixtral": "Mixtral", "mistral": "Mistral", "gemma": "Gemma", + "nemotron": "NeMotron", } +GPT_BASED_MODELS = [ + "gpt3", + "bert", + "llama", + "baichuan2", + "chatglm", + "qwen2", + "mixtral", + "mistral", + "gemma", + "nemotron", +] + @dataclass class ModelSizeParams: @@ -58,7 +72,7 @@ class ModelSizeParams: def init_params(self): model_name = self.model_name model_size_in_b = self.model_size_in_b - if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: + if model_name in GPT_BASED_MODELS: self.ffn = 4 * self.hs if model_size_in_b < 0.25: self.hs, self.att_h, self.lr = 768, 12, 6e-4 @@ -297,7 +311,7 @@ def _calculate_model_size( NotImplementedError: if the model name is not valid. """ - if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: + if model_name in GPT_BASED_MODELS: model_size = ( 12 * num_layers @@ -385,7 +399,7 @@ def generic_base_config( model_name, ).init_params() - if model_name in ["gpt3", "llama", "baichuan2", "chatglm", "qwen2", "mixtral", "mistral", "gemma"]: + if model_name in GPT_BASED_MODELS: base_cfg["model"].num_layers = params.layers base_cfg["model"].hidden_size = params.hs base_cfg["model"].num_attention_heads = params.att_h @@ -437,62 +451,22 @@ def modify_cfg( Returns: dict: dictionary containing the updated model configuration parameters. 
""" - + new_cfg = copy.deepcopy(base_cfg) if act is not None: - if model_name in [ - "gpt3", - "bert", - "llama", - "baichuan2", - "chatglm", - "qwen2", - "mixtral", - "mistral", - "gemma", - ]: + if model_name in GPT_BASED_MODELS: new_cfg["auto_config"]["activations_checkpoint_num_layers"] = act else: new_cfg["auto_config"]["encoder"]["activations_checkpoint_num_layers"] = act // 2 new_cfg["auto_config"]["decoder"]["activations_checkpoint_num_layers"] = act // 2 - if num_mbs_act is not None and model_name in [ - "gpt3", - "bert", - "llama", - "baichuan2", - "chatglm", - "qwen2", - "mixtral", - "mistral", - "gemma", - ]: + if num_mbs_act is not None and model_name in GPT_BASED_MODELS: new_cfg["auto_config"]["num_micro_batches_with_partial_activation_checkpoints"] = num_mbs_act - if act_per_pipe is not None and model_name in [ - "gpt3", - "bert", - "llama", - "baichuan2", - "chatglm", - "qwen2", - "mixtral", - "mistral", - "gemma", - ]: + if act_per_pipe is not None and model_name in GPT_BASED_MODELS: new_cfg["auto_config"]["activations_checkpoint_layers_per_pipeline"] = act_per_pipe - if virtual_pipelines is not None and model_name in [ - "gpt3", - "bert", - "llama", - "baichuan2", - "chatglm", - "qwen2", - "mixtral", - "mistral", - "gemma", - ]: + if virtual_pipelines is not None and model_name in GPT_BASED_MODELS: new_cfg["auto_config"]["virtual_pipeline_model_parallel_size"] = virtual_pipelines new_cfg["auto_config"]["tensor_model_parallel_size"] = tp @@ -506,17 +480,7 @@ def modify_cfg( if ep is not None: new_cfg["auto_config"]["expert_model_parallel_size"] = ep - if model_name in [ - "gpt3", - "bert", - "llama", - "baichuan2", - "chatglm", - "qwen2", - "mixtral", - "mistral", - "gemma", - ]: + if model_name in GPT_BASED_MODELS: att_heads = new_cfg["model"].num_attention_heads num_layers = new_cfg["model"].num_layers else: diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 05312e872d4b..6d5595fffb5a 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -33,6 +33,7 @@ "mixtral", "mistral", "gemma", + "nemotron", ] SUPPORTED_TOKENIZERS = [ @@ -54,7 +55,6 @@ def __init__( tokenizer_type: Optional[str] = "autotokenizer", tokenizer_path: Optional[str] = "GPT2BPETokenizer", model_size: Optional[int] = None, - model_version: Optional[int] = None, gpus_per_node: Optional[int] = 8, gpu_memory_gb: Optional[int] = 80, model_measure: Optional[str] = "B", @@ -73,7 +73,6 @@ def __init__( max_training_days: Optional[int] = 2, max_steps_per_run: Optional[int] = 50, vocab_size: Optional[int] = 51200, - custom_model: Optional[bool] = False, ): """ Args: @@ -84,10 +83,8 @@ def __init__( tokenizer_type (Optional[str]): tokenizer type. tokenizer_path (Optional[str]): path to the tokenizer model. model_size (Optional[int]): size of model to be trained. - model_version (Optional[int]): version of model. 3 for GPT3, 2 for Llama2. gpus_per_node (Optional[int]): number of GPUs per node to be used. gpu_memory_gb (Optional[int]): memory per GPU, in GB. Currently 40GB and 80GB A100s/H100s supported. - model_measure (Optional[str]): "M" if model_size is specified in millions. "B" if in billions. seq_length (Optional[int]): model sequence length. Available seq_length list for GPT-based models: [2048, 4096, 8192, 16384, 32768]. global_batch_size (Optional[int]): model global batch size. Set to "auto" if you want auto configurator to find optimal gbs. 
tensor_parallel_sizes (Optional[List[int]]): set to "auto" to use our recommendation, or a list, such as [1, 2, 4, 8]. @@ -103,7 +100,6 @@ def __init__( max_training_days (Optional[int]): number of days expected model to be trained. max_steps_per_run (Optional[int]): maximum number of steps per run for the grid search. vocab_size (Optional[int]): size of tokenizer vocabulary. - custom_model (Optional[bool]): set to True if you want to use custom model. """ model_type = self._get_model_type(model.__class__.__name__) @@ -291,5 +287,7 @@ def _get_model_type(self, model: str) -> str: return "mistral" elif "Gemma" in model: return "gemma" + elif "Nemotron" in model: + return "nemotron" else: return None From 86694e6d320b45d0045e1f91361d65c9a7df8195 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 28 Aug 2024 17:18:51 +0000 Subject: [PATCH 23/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../base_configs/__init__.py | 2 +- .../auto_configurator/core/training_config.py | 25 ++++--------------- .../llm/tools/auto_configurator/core/utils.py | 2 +- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py index 07f38425b69e..632adbf1142a 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral +from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 63477028d267..5c83cb6c1104 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -55,17 +55,11 @@ def generate_grid_search_configs( model_measure = train_cfg.get("model_measure") # 2 * num_layers is needed because of encoder/decoder architecture. - multiplier = ( - 1 - if model_name in GPT_BASED_MODELS - else 2 - ) + multiplier = 1 if model_name in GPT_BASED_MODELS else 2 seq_length = base_cfg["model"].seq_length num_layers = ( - base_cfg["model"].num_layers - if model_name in GPT_BASED_MODELS - else base_cfg["model"].encoder.num_layers + base_cfg["model"].num_layers if model_name in GPT_BASED_MODELS else base_cfg["model"].encoder.num_layers ) if model_name in GPT_BASED_MODELS: @@ -198,10 +192,7 @@ def _set_activations_checkpoint_params( min_layers_per_pipe = 0 max_layers_per_pipe = num_layers interval_layers_per_pipe = act_multiple - if ( - model_name in GPT_BASED_MODELS - and pp > 2 - ): # Interleaved pipeline scheduling. + if model_name in GPT_BASED_MODELS and pp > 2: # Interleaved pipeline scheduling. 
virtual_pipelines = num_layers // pp # TODO: verify that this is the best value. act_multiple = 1 max_micro_b = pp * (virtual_pipelines - 1) + (pp - 1) * 2 + 1 @@ -868,14 +859,8 @@ def _calculate_tp_pp_mbs_grid( gbs_size = train_cfg.get("global_batch_size") gpu_memory_gb = train_cfg.get("gpu_memory_gb") model_measure = train_cfg.get("model_measure") - multiplier = ( - 1 - if model_name in GPT_BASED_MODELS - else 2 - ) - init_pp = ( - [] if model_name in GPT_BASED_MODELS else [1] - ) + multiplier = 1 if model_name in GPT_BASED_MODELS else 2 + init_pp = [] if model_name in GPT_BASED_MODELS else [1] valid_pp = init_pp + [ multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 ] # Only divisors of num_layers are possible. diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index 67579e821599..9d51f1cb2b4c 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -451,7 +451,7 @@ def modify_cfg( Returns: dict: dictionary containing the updated model configuration parameters. """ - + new_cfg = copy.deepcopy(base_cfg) if act is not None: if model_name in GPT_BASED_MODELS: From 0b896b7b79d14a72f96813b8f764c5df564dd905 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 2 Sep 2024 09:27:14 -0700 Subject: [PATCH 24/63] remove search_config.py Signed-off-by: dimapihtar --- .../auto_configurator/core/search_config.py | 85 ------------------- .../llm/tools/auto_configurator/core/utils.py | 4 - .../llm/tools/auto_configurator/runner.py | 26 +++++- 3 files changed, 24 insertions(+), 91 deletions(-) delete mode 100644 nemo/collections/llm/tools/auto_configurator/core/search_config.py diff --git a/nemo/collections/llm/tools/auto_configurator/core/search_config.py b/nemo/collections/llm/tools/auto_configurator/core/search_config.py deleted file mode 100644 index 62db570d4122..000000000000 --- a/nemo/collections/llm/tools/auto_configurator/core/search_config.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from typing import Optional - -from nemo.collections.llm.tools.auto_configurator.core.training_config import generate_grid_search_configs -from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config - -SUPPORTED_MODELS = [ - "gpt3", - "llama", - "mixtral", - "mistral", - "gemma", - "nemotron", -] - - -def search_configs(cfg: dict): - """ - Main function that implements the entire pipeline to search the optimal - model config and launch the grid searches for both training and inference - constraints. - :param dict cfg: main hydra config object for the auto configurator. - :return: dictionary of generated configs. 
- :rtype: dict - """ - - # Read config - num_nodes = cfg.get("num_nodes") - gpus_per_node = cfg.get("gpus_per_node", 8) - gpu_memory_gb = cfg.get("gpu_memory_gb", 80) - max_training_days = cfg.get("max_training_days", 2) - max_minutes_per_run = cfg.get("max_minutes_per_run", 30) - model_name = cfg.get("model_type") - model_version = cfg.get("model_version") - model_size_in_b = cfg.get("model_size") - model_measure = cfg.get("model_measure", "B") - vocab_size = cfg.get("vocab_size", 32000) - tflops_per_gpu = cfg.get("tflops_per_gpu", 140) - num_tokens_in_b = cfg.get("num_tokens_in_b", 300) - seq_length = cfg.get("seq_length", 2048) - global_batch_size = cfg.get("global_batch_size") - - assert model_name in SUPPORTED_MODELS, f"model must be set to one of {SUPPORTED_MODELS}" - - gpu_count = num_nodes * gpus_per_node - assert isinstance(gpu_count, int) and gpu_count > 0, "num_nodes * gpus_per_node must be an int larger than zero." - assert isinstance(gpu_memory_gb, int) and gpu_memory_gb in ( - 40, - 80, - ), "gpu_memory_gb can only be 40 or 80." - assert ( - isinstance(max_minutes_per_run, int) and max_minutes_per_run >= 10 - ), "max_minutes_per_run must be an int and be at least 10 minutes." - - cfg["model_size_in_b"] = model_size_in_b - cfg["gpu_count"] = gpu_count - cfg["num_gpus"] = gpus_per_node - - # Generate base config for the given model size - base_cfg, train_cfg = generic_base_config( - model_name=model_name, - model_version=model_version, - model_size_in_b=model_size_in_b, - model_measure=model_measure, - cfg=cfg, - ) - - # Launch grid search for training constraints - configs = generate_grid_search_configs(base_cfg, train_cfg) - - return configs diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index 9d51f1cb2b4c..b110d23ee4b3 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -342,18 +342,14 @@ def _calculate_model_size( def generic_base_config( model_name: str = "llama", - model_version: int = 2, model_size_in_b: int = 7, - model_measure: str = "B", cfg: dict = {}, ) -> dict: """Generates a base config dictionary from a base config python file. Args: model_name (str): name of the model, i.e. gpt3, t5, mt5... - model_version (int): version of model. model_size_in_b (int): model size. - model_measure (str): model measure. Billions if "B", millions if "M". cfg (dict): dict config object for the Auto Configurator tool. 
Returns: diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 6d5595fffb5a..7d0f1dedbea7 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -21,7 +21,8 @@ from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer from nemo.collections.llm import GPTModel, PreTrainingDataModule from nemo.collections.llm.api import pretrain -from nemo.collections.llm.tools.auto_configurator.core.search_config import search_configs +from nemo.collections.llm.tools.auto_configurator.core.training_config import generate_grid_search_configs +from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config from nemo.collections.llm.utils import Config, Partial from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule from nemo.utils import logging @@ -108,10 +109,22 @@ def __init__( assert num_nodes, "num_nodes value must be specified." assert data_paths, "training data must be specified." assert path_to_logs, f"path_to_logs parameter must be specified." + gpu_count = num_nodes * gpus_per_node + assert gpu_count > 0, "num_nodes * gpus_per_node must be an int larger than zero." + assert gpu_memory_gb in ( + 40, + 80, + ), "gpu_memory_gb can only be 40 or 80." + assert ( + max_minutes_per_run >= 10 + ), "max_minutes_per_run must be an int and be at least 10 minutes." self.config = locals() self.config.pop('self') self.config["model_type"] = model_type + self.config["model_size_in_b"] = model_size + self.config["gpu_count"] = gpu_count + self.config["num_gpus"] = gpus_per_node # Print the config logging.info(self._get_message(self.config)) @@ -127,7 +140,16 @@ def generate_configs(self) -> dict: :rtype: dict. """ - configs = search_configs(self.config) + # Generate base config for the given model size + base_cfg, train_cfg = generic_base_config( + model_name=self.config["model_type"], + model_size_in_b=self.config["model_size"], + cfg=self.config, + ) + + # Launch grid search for training constraints + configs = generate_grid_search_configs(base_cfg, train_cfg) + tokenizer_type = self.config.get("tokenizer_type") tokenizer_path = self.config.get("tokenizer_path") path_to_logs = self.config.get("path_to_logs") From 2d062b04fc4adb0b05ffeab4ca1859504d665afd Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 2 Sep 2024 16:28:14 +0000 Subject: [PATCH 25/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- nemo/collections/llm/tools/auto_configurator/runner.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 7d0f1dedbea7..9631eafb887d 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -115,9 +115,7 @@ def __init__( 40, 80, ), "gpu_memory_gb can only be 40 or 80." - assert ( - max_minutes_per_run >= 10 - ), "max_minutes_per_run must be an int and be at least 10 minutes." + assert max_minutes_per_run >= 10, "max_minutes_per_run must be an int and be at least 10 minutes." 
self.config = locals() self.config.pop('self') From 1e3011887d4506585e15f62cf9be7880a6cc7506 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Sep 2024 02:45:14 -0700 Subject: [PATCH 26/63] move configs creation to Basic class Signed-off-by: dimapihtar --- .../auto_configurator/base_configs/basic.py | 81 ++++++++++++++++++- .../llm/tools/auto_configurator/runner.py | 21 ----- 2 files changed, 77 insertions(+), 25 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py index 33a210208ca6..f6f335245cc9 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py @@ -14,7 +14,10 @@ from megatron.core.optimizer import OptimizerConfig +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule +from nemo.collections.llm import GPTModel, PreTrainingDataModule from nemo.collections.llm.utils import Config +from nemo import lightning as nl class Basic: @@ -69,13 +72,24 @@ def get_optim_config(self) -> OptimizerConfig: **optim_params, ) - return optim_config + sched = Config( + CosineAnnealingScheduler, + warmup_steps=10, + constant_steps=0, + min_lr=optim_config.min_lr, + ) + + return Config( + MegatronOptimizerModule, + config=optim_config, + lr_scheduler=sched, + ) def get_trainer_config(self) -> dict: """Function that returns config for PTL trainer. Returns: - dict: trainer config. + Config: trainer config. """ trainer_config = { @@ -93,10 +107,37 @@ def get_trainer_config(self) -> dict: "max_steps": self.max_steps, "val_check_interval": self.max_steps, } + + strategy = Config( + nl.MegatronStrategy, + pipeline_dtype=torch.bfloat16, + ) + + return Config( + nl.Trainer, + **trainer_config, + strategy=strategy, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + callbacks=[Config(TimingCallback)], + ) return trainer_config - def get_data_config(self) -> dict: + def get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: + """ + Function that returns the tokenizer config. + : str tokenizer_type: tokenizer type. + : str tokenizer_path: path to the tokenizer. + :return: tokenizer config. + :rtype: Config. + """ + + if tokenizer_type == "sentencepiece": + return Config(SentencePieceTokenizer, model_path=tokenizer_path) + else: + return Config(AutoTokenizer, pretrained_model_name=tokenizer_path) + + def get_data(self) -> dict: """Function that returns dataset config. Returns: @@ -111,7 +152,39 @@ def get_data_config(self) -> dict: "index_mapping_dir": None, } - return data_config + return Config( + PreTrainingDataModule, + **data_config, + tokenizer=tokenizer_config, + ) + + def get_logger(self, run_name: str, path_to_logs: str) -> Config: + """ + Function that returns the training strategy. + : str run_name: name of run. + : str path_to_logs: path to logs directory. + :return: training logger. + :rtype: Config. + """ + + tb_logger = Config(TensorBoardLogger, save_dir=path_to_logs) + + ckpt = Config( + nl.ModelCheckpoint, + monitor="reduced_train_loss", + save_best_model=False, + save_last=False, + save_top_k=0, + ) + + return Config( + nl.NeMoLogger, + ckpt=ckpt, + name=run_name, + tensorboard=tb_logger, + wandb=None, + dir=path_to_logs, + ) def get_run_config(self) -> dict: """Function that returns config for cluster job. 
diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 9631eafb887d..3fa8bf7fd5a1 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -198,27 +198,6 @@ def _get_data(self, data_config: dict, tokenizer_config: Config) -> Config: tokenizer=tokenizer_config, ) - def _get_optim(self, optim_config: Config) -> Config: - """ - Function that returns the optimizer. - : Config optim_config: optimizer config. - :return: optimizer. - :rtype: Config. - """ - - sched = Config( - CosineAnnealingScheduler, - warmup_steps=10, - constant_steps=0, - min_lr=optim_config.min_lr, - ) - - return Config( - MegatronOptimizerModule, - config=optim_config, - lr_scheduler=sched, - ) - def _get_trainer(self, trainer_config: dict, strategy: Config) -> Config: """ Function that returns the trainer. From 14b95496d7f8656a9e8d1fc56b7b9f7060fda05d Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Sep 2024 09:46:23 +0000 Subject: [PATCH 27/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/base_configs/basic.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py index f6f335245cc9..1d4cb0b8a98d 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py @@ -14,10 +14,10 @@ from megatron.core.optimizer import OptimizerConfig -from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule +from nemo import lightning as nl from nemo.collections.llm import GPTModel, PreTrainingDataModule from nemo.collections.llm.utils import Config -from nemo import lightning as nl +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule class Basic: @@ -107,7 +107,7 @@ def get_trainer_config(self) -> dict: "max_steps": self.max_steps, "val_check_interval": self.max_steps, } - + strategy = Config( nl.MegatronStrategy, pipeline_dtype=torch.bfloat16, @@ -157,7 +157,7 @@ def get_data(self) -> dict: **data_config, tokenizer=tokenizer_config, ) - + def get_logger(self, run_name: str, path_to_logs: str) -> Config: """ Function that returns the training strategy. From e1ccec1362e9f76d57aeeeeac87d95c70134fd26 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Sep 2024 06:27:46 -0700 Subject: [PATCH 28/63] move to common basic class Signed-off-by: dimapihtar --- .../base_configs/__init__.py | 13 +- .../auto_configurator/base_configs/basic.py | 74 +++--- .../auto_configurator/core/training_config.py | 50 ++-- .../llm/tools/auto_configurator/core/utils.py | 73 +++--- .../llm/tools/auto_configurator/runner.py | 218 ++++++------------ 5 files changed, 173 insertions(+), 255 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py index 632adbf1142a..cf1714ff3b43 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma -from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT -from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama -from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral -from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral -from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron +from nemo.collections.llm.tools.auto_configurator.base_configs.basic import ModelConfig +#from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma +#from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT +#from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama +#from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral +#from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral +#from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py index 1d4cb0b8a98d..fc26dae49592 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py @@ -12,19 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. +import torch +from dataclasses import dataclass, field + from megatron.core.optimizer import OptimizerConfig -from nemo import lightning as nl +from pytorch_lightning.loggers import TensorBoardLogger + +from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule from nemo.collections.llm import GPTModel, PreTrainingDataModule from nemo.collections.llm.utils import Config -from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule +from nemo.utils.exp_manager import TimingCallback +from nemo import lightning as nl -class Basic: +@dataclass +class ModelConfig: def __init__( self, - model: Config = None, - cfg: dict = {}, + config = None, ): """ Args: @@ -35,21 +42,17 @@ def __init__( cfg (dict): auto configurator runner config. """ - self.model = model - self.num_nodes = cfg.get("num_nodes") - self.num_gpus = cfg.get("num_gpus") - self.max_steps = cfg.get("max_steps_per_run") - self.seq_length = cfg.get("seq_length") - self.global_batch_size = cfg.get("global_batch_size") - self.data_paths = cfg.get("data_paths") - self.max_minutes_per_run = cfg.get("max_minutes_per_run") + self.config = config - def model_config(self): + def get_model(self): """Function that returns model config.""" - None + self.config.model.global_batch_size = self.config.global_batch_size + self.config.model.seq_length = self.config.seq_length + + return self.config.model - def get_optim_config(self) -> OptimizerConfig: + def get_optim(self) -> OptimizerConfig: """Function that returns optimizer config. Returns: @@ -85,7 +88,7 @@ def get_optim_config(self) -> OptimizerConfig: lr_scheduler=sched, ) - def get_trainer_config(self) -> dict: + def get_trainer(self) -> dict: """Function that returns config for PTL trainer. 
Returns: @@ -102,10 +105,10 @@ def get_trainer_config(self) -> dict: "limit_test_batches": 1, "accumulate_grad_batches": 1, "gradient_clip_val": 1.0, - "num_nodes": self.num_nodes, - "devices": self.num_gpus, - "max_steps": self.max_steps, - "val_check_interval": self.max_steps, + "num_nodes": self.config.num_nodes, + "devices": self.config.num_gpus, + "max_steps": self.config.max_steps_per_run, + "val_check_interval": self.config.max_steps_per_run, } strategy = Config( @@ -144,21 +147,28 @@ def get_data(self) -> dict: dict: data config. """ + # Data config data_config = { - "paths": self.data_paths, - "seq_length": self.seq_length, - "global_batch_size": self.global_batch_size, + "paths": self.config.data_paths, + "seq_length": self.config.seq_length, + "global_batch_size": self.config.global_batch_size, "num_workers": 2, "index_mapping_dir": None, } + # Define the tokenizer + tokenizer = self.get_tokenizer( + self.config.tokenizer_type, + self.config.tokenizer_path, + ) + return Config( PreTrainingDataModule, **data_config, - tokenizer=tokenizer_config, + tokenizer=tokenizer, ) - - def get_logger(self, run_name: str, path_to_logs: str) -> Config: + + def get_logger(self) -> Config: """ Function that returns the training strategy. : str run_name: name of run. @@ -167,7 +177,8 @@ def get_logger(self, run_name: str, path_to_logs: str) -> Config: :rtype: Config. """ - tb_logger = Config(TensorBoardLogger, save_dir=path_to_logs) + # Define TensorBoard Logger + tb_logger = Config(TensorBoardLogger, save_dir=self.config.path_to_logs) ckpt = Config( nl.ModelCheckpoint, @@ -180,10 +191,9 @@ def get_logger(self, run_name: str, path_to_logs: str) -> Config: return Config( nl.NeMoLogger, ckpt=ckpt, - name=run_name, tensorboard=tb_logger, wandb=None, - dir=path_to_logs, + dir=self.config.path_to_logs, ) def get_run_config(self) -> dict: @@ -194,9 +204,9 @@ def get_run_config(self) -> dict: """ run_config = { - "name": self.model.__class__.__name__, + "name": self.config.model.__class__.__name__, "results_dir": None, - "time_limit": f"0-00:{self.max_minutes_per_run}:00", + "time_limit": f"0-00:{self.config.max_minutes_per_run}:00", } return run_config diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 5c83cb6c1104..06a94ce18e7b 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -49,10 +49,8 @@ def generate_grid_search_configs( dict: generated configs. """ - model_name = train_cfg.get("model_type") - model_version = train_cfg.get("model_version") - model_size_in_b = train_cfg.get("model_size_in_b") - model_measure = train_cfg.get("model_measure") + model_name = train_cfg.model_type + model_size_in_b = train_cfg.model_size_in_b # 2 * num_layers is needed because of encoder/decoder architecture. 
multiplier = 1 if model_name in GPT_BASED_MODELS else 2 @@ -75,9 +73,9 @@ def generate_grid_search_configs( train_cfg=train_cfg, ) - max_minutes = train_cfg.get("max_minutes_per_run") - max_steps = train_cfg.get("max_steps_per_run") - num_nodes = train_cfg.get("num_nodes") + max_minutes = train_cfg.max_minutes_per_run + max_steps = train_cfg.max_steps_per_run + num_nodes = train_cfg.num_nodes valid_tp_pp_list = [] for tp in params.tp: @@ -85,7 +83,7 @@ def generate_grid_search_configs( for cp in params.cp: for ep in params.ep: for mbs in params.mbs: - num_gpus = base_cfg["trainer"]["num_nodes"] * base_cfg["trainer"]["devices"] + num_gpus = base_cfg["trainer"].num_nodes * base_cfg["trainer"].devices base_cfg["model"].global_batch_size = params.gbs if model_name in GPT_BASED_MODELS: att_heads = base_cfg["model"].num_attention_heads @@ -127,7 +125,6 @@ def generate_grid_search_configs( multiplier, model_size_in_b, model_name, - model_measure, ) for mbs in params.mbs: kwargs = { @@ -168,10 +165,9 @@ def generate_grid_search_configs( def _set_activations_checkpoint_params( - tp, pp, cp, ep, num_layers, act_method, multiplier, model_size_in_b, model_name, model_measure + tp, pp, cp, ep, num_layers, act_method, multiplier, model_size_in_b, model_name ): act_multiple = 4 // pp - model_size_in_b = model_size_in_b / 1000 if model_measure == "M" else model_size_in_b if act_method == "block": if 1.0 <= model_size_in_b < 11.3: act_multiple = 8 // pp @@ -238,14 +234,12 @@ class GPT3GridSearch: model_size_in_b (float): number of parameters in the model. valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. seq length (int): sequence length to use for training. - model_measure (str): measure of model size (millions or billions). gpu_memory_gb (int): size of GPU memory in GB. """ model_size_in_b: int valid_pp: List[int] seq_length: int - model_measure: str gpu_memory_gb: int tp = [1, 2, 4, 8] @@ -259,7 +253,7 @@ class GPT3GridSearch: max_model_parallel: int = 8 def init_params(self): - model_size_in_b = self.model_size_in_b / 1000 if self.model_measure == "M" else self.model_size_in_b + model_size_in_b = self.model_size_in_b gpu_memory_gb = self.gpu_memory_gb seq_length = self.seq_length @@ -554,7 +548,6 @@ class T5GridSearch: model_size_in_b (float): number of parameters in the model. valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. seq length (int): sequence length to use for training. - model_measure (str): measure of model size (millions or billions). gpu_memory_gb (int): size of GPU memory in GB. """ @@ -562,7 +555,6 @@ class T5GridSearch: seq_length: int gpu_memory_gb: int valid_pp: List[int] - model_measure: str tp = [1, 2, 4, 8] pp = [1] @@ -575,7 +567,7 @@ class T5GridSearch: max_model_parallel: int = 8 def init_params(self): - model_size_in_b = self.model_size_in_b / 1000 if self.model_measure == "M" else self.model_size_in_b + model_size_in_b = self.model_size_in_b gpu_memory_gb = self.gpu_memory_gb seq_length = self.seq_length @@ -696,7 +688,6 @@ class BertGridSearch: model_size_in_b (float): number of parameters in the model. valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. seq length (int): sequence length to use for training. - model_measure (str): measure of model size (millions or billions). gpu_memory_gb (int): size of GPU memory in GB. 
""" @@ -704,7 +695,6 @@ class BertGridSearch: seq_length: int gpu_memory_gb: int valid_pp: List[int] - model_measure: str tp = [1, 2, 4, 8] pp = [1] @@ -717,7 +707,7 @@ class BertGridSearch: max_model_parallel: int = 8 def init_params(self): - model_size_in_b = self.model_size_in_b / 1000 if self.model_measure == "M" else self.model_size_in_b + model_size_in_b = self.model_size_in_b gpu_memory_gb = self.gpu_memory_gb seq_length = self.seq_length @@ -849,16 +839,15 @@ def _calculate_tp_pp_mbs_grid( NotImplementedError: if the model_name is not one of the supported models. """ - tp_sizes = train_cfg.get("tensor_parallel_sizes") - pp_sizes = train_cfg.get("pipeline_parallel_sizes") - cp_sizes = train_cfg.get("context_parallel_sizes", None) - ep_sizes = train_cfg.get("expert_parallel_sizes", None) - min_model_parallel_size = train_cfg.get("min_model_parallel_size") - max_model_parallel_size = train_cfg.get("max_model_parallel_size") - mbs_sizes = train_cfg.get("micro_batch_sizes") - gbs_size = train_cfg.get("global_batch_size") - gpu_memory_gb = train_cfg.get("gpu_memory_gb") - model_measure = train_cfg.get("model_measure") + tp_sizes = train_cfg.tensor_parallel_sizes + pp_sizes = train_cfg.pipeline_parallel_sizes + cp_sizes = train_cfg.context_parallel_sizes + ep_sizes = train_cfg.expert_parallel_sizes + min_model_parallel_size = train_cfg.min_model_parallel_size + max_model_parallel_size = train_cfg.max_model_parallel_size + mbs_sizes = train_cfg.micro_batch_sizes + gbs_size = train_cfg.global_batch_size + gpu_memory_gb = train_cfg.gpu_memory_gb multiplier = 1 if model_name in GPT_BASED_MODELS else 2 init_pp = [] if model_name in GPT_BASED_MODELS else [1] valid_pp = init_pp + [ @@ -869,7 +858,6 @@ def _calculate_tp_pp_mbs_grid( "model_size_in_b": model_size_in_b, "valid_pp": valid_pp, "seq_length": seq_length, - "model_measure": model_measure, "gpu_memory_gb": gpu_memory_gb, } diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index b110d23ee4b3..5d2a5e775b96 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -17,6 +17,7 @@ from typing import List, Optional, Tuple from nemo.collections.llm.tools.auto_configurator import base_configs +from nemo.collections.llm.utils import Config MODULES = { "gpt3": "GPT", @@ -341,9 +342,7 @@ def _calculate_model_size( def generic_base_config( - model_name: str = "llama", - model_size_in_b: int = 7, - cfg: dict = {}, + config = None, ) -> dict: """Generates a base config dictionary from a base config python file. 
@@ -358,32 +357,28 @@ def generic_base_config( from nemo.collections.llm.tools.auto_configurator.core.base_config import calculate_model_size - default_model = False if model_size_in_b else True - custom_model = True if cfg.get("custom_model") else False - - model_cls = getattr(base_configs, MODULES[model_name]) + default_model = False if config.model_size_in_b else True model_size_in_b = calculate_model_size( - cfg.get("gpu_count"), - cfg.get("max_training_days"), - model_size_in_b, - cfg.get("tflops_per_gpu"), - cfg.get("num_tokens_in_b"), - model_name, + config.gpu_count, + config.max_training_days, + config.model_size_in_b, + config.tflops_per_gpu, + config.num_tokens_in_b, + config.model_type, ) if default_model: model = model_cls(cfg=cfg) - elif custom_model: - model = base_configs.custom(name=MODULES[model_name], cfg=cfg) else: - model = model_cls(model=cfg.get("model"), cfg=cfg) - + model = base_configs.ModelConfig(config) + #import pdb + #pdb.set_trace() base_cfg = { - "model": model.get_model_config(), - "optim": model.get_optim_config(), - "trainer": model.get_trainer_config(), - "data": model.get_data_config(), + "model": model.get_model(), + "optim": model.get_optim(), + "trainer": model.get_trainer(), + "data": model.get_data(), "run": model.get_run_config(), } @@ -405,9 +400,9 @@ def generic_base_config( else: base_cfg["model"].ffn_hidden_size = params.ffn - cfg["model_size_in_b"] = model_size_in_b + config.model_size_in_b = model_size_in_b - return base_cfg, cfg + return base_cfg, config def modify_cfg( @@ -449,6 +444,19 @@ def modify_cfg( """ new_cfg = copy.deepcopy(base_cfg) + if model_name in GPT_BASED_MODELS: + att_heads = new_cfg["model"].num_attention_heads + num_layers = new_cfg["model"].num_layers + else: + att_heads = new_cfg["model"].encoder.num_attention_heads + num_layers = new_cfg["model"].encoder.num_layers + + # gbs = mbs * num_gpus * accumulate_grad_batches / (tp * pp) + num_gpus = new_cfg["trainer"].num_nodes * new_cfg["trainer"].devices + gbs = new_cfg["model"].global_batch_size + seq_len = new_cfg["model"].seq_length + + new_cfg = dict(auto_config={}, run=new_cfg["run"]) if act is not None: if model_name in GPT_BASED_MODELS: new_cfg["auto_config"]["activations_checkpoint_num_layers"] = act @@ -468,7 +476,7 @@ def modify_cfg( new_cfg["auto_config"]["tensor_model_parallel_size"] = tp new_cfg["auto_config"]["pipeline_model_parallel_size"] = pp new_cfg["auto_config"]["micro_batch_size"] = mbs - new_cfg["data"]["micro_batch_size"] = mbs + new_cfg["auto_config"]["global_batch_size"] = gbs if cp is not None: new_cfg["auto_config"]["context_parallel_size"] = cp @@ -476,27 +484,11 @@ def modify_cfg( if ep is not None: new_cfg["auto_config"]["expert_model_parallel_size"] = ep - if model_name in GPT_BASED_MODELS: - att_heads = new_cfg["model"].num_attention_heads - num_layers = new_cfg["model"].num_layers - else: - att_heads = new_cfg["model"].encoder.num_attention_heads - num_layers = new_cfg["model"].encoder.num_layers - - # gbs = mbs * num_gpus * accumulate_grad_batches / (tp * pp) - num_gpus = new_cfg["trainer"]["num_nodes"] * new_cfg["trainer"]["devices"] - gbs = new_cfg["model"].global_batch_size - new_cfg["data"]["global_batch_size"] = gbs - seq_len = new_cfg["model"].seq_length - mod_gbs = gbs % (mbs * num_gpus / (tp * pp)) mod_att_heads = att_heads % tp mod_layers = num_layers % pp if mod_gbs == 0 and mod_att_heads == 0 and mod_layers == 0: # Valid config - new_cfg["trainer"]["num_nodes"] = num_nodes # Necessary for short single-node test. 
- new_cfg["trainer"]["max_steps"] = max_steps - new_cfg["trainer"]["val_check_interval"] = max_steps days = max_minutes // 3600 hours = (max_minutes % 3600) // 60 mins = (max_minutes % 3600) % 60 @@ -507,5 +499,6 @@ def modify_cfg( print( f"Valid config: SeqLen={seq_len}, GBS={gbs}, MBS={mbs}, TP={tp}, PP={pp}, CP={cp}, EP={ep}, act_ckpt_layers={act}, num_mbs_act={num_mbs_act}, act_per_pipe={act_per_pipe}. Adding to directory." ) + print(new_cfg) return new_cfg return None diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 3fa8bf7fd5a1..ae3ff031ac1c 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -13,8 +13,10 @@ # limitations under the License. from typing import List, Optional - +from dataclasses import dataclass import torch +import re + from pytorch_lightning.loggers import TensorBoardLogger from nemo import lightning as nl @@ -55,10 +57,8 @@ def __init__( path_to_logs: Optional[str] = None, tokenizer_type: Optional[str] = "autotokenizer", tokenizer_path: Optional[str] = "GPT2BPETokenizer", - model_size: Optional[int] = None, gpus_per_node: Optional[int] = 8, gpu_memory_gb: Optional[int] = 80, - model_measure: Optional[str] = "B", seq_length: Optional[int] = 2048, global_batch_size: Optional[int] = "auto", tensor_parallel_sizes: Optional[List[int]] = "auto", @@ -103,6 +103,14 @@ def __init__( vocab_size (Optional[int]): size of tokenizer vocabulary. """ + # Print out the config + for key, value in locals().items(): + if key != 'self': + setattr(self, key, value) + config = locals() + config.pop('self') + logging.info(self._get_message(config)) + model_type = self._get_model_type(model.__class__.__name__) assert model_type in SUPPORTED_MODELS, f"model_type must be set to one of {SUPPORTED_MODELS}." assert tokenizer_type in SUPPORTED_TOKENIZERS, f"tokenizer_type must be set to one of {SUPPORTED_TOKENIZERS}." @@ -117,149 +125,11 @@ def __init__( ), "gpu_memory_gb can only be 40 or 80." assert max_minutes_per_run >= 10, "max_minutes_per_run must be an int and be at least 10 minutes." - self.config = locals() - self.config.pop('self') - self.config["model_type"] = model_type - self.config["model_size_in_b"] = model_size - self.config["gpu_count"] = gpu_count - self.config["num_gpus"] = gpus_per_node - - # Print the config - logging.info(self._get_message(self.config)) - - def generate_configs(self) -> dict: - """ - Function that returns a dictionary of Partial configs. - : dict config: runner config. - : str tokenizer_type: tokenizer type. - : str tokenizer_path: path to the tokenizer. - : str path_to_logs: path to logs directory. - :return: dictionary of Partial configs. - :rtype: dict. 
- """ - - # Generate base config for the given model size - base_cfg, train_cfg = generic_base_config( - model_name=self.config["model_type"], - model_size_in_b=self.config["model_size"], - cfg=self.config, - ) - - # Launch grid search for training constraints - configs = generate_grid_search_configs(base_cfg, train_cfg) - - tokenizer_type = self.config.get("tokenizer_type") - tokenizer_path = self.config.get("tokenizer_path") - path_to_logs = self.config.get("path_to_logs") - - tokenizer = self._get_tokenizer(tokenizer_type, tokenizer_path) - for name, config in configs.items(): - strategy = self._get_startegy(config['auto_config']) - configs[name] = Partial( - pretrain, - model=self._get_model(config['model'], tokenizer), - trainer=self._get_trainer(config['trainer'], strategy), - data=self._get_data(config['data'], tokenizer), - optim=self._get_optim(config['optim']), - log=self._get_logger(name, path_to_logs), - resume=None, - ) - - return configs - - def _get_model(self, model_config, tokenizer): - return GPTModel(model_config, tokenizer=tokenizer) - - def _get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: - """ - Function that returns the tokenizer config. - : str tokenizer_type: tokenizer type. - : str tokenizer_path: path to the tokenizer. - :return: tokenizer config. - :rtype: Config. - """ - - if tokenizer_type == "sentencepiece": - return Config(SentencePieceTokenizer, model_path=tokenizer_path) - else: - return Config(AutoTokenizer, pretrained_model_name=tokenizer_path) - - def _get_data(self, data_config: dict, tokenizer_config: Config) -> Config: - """ - Function that returns the data module. - : Config tokenizer: tokenizer config. - :return: data module. - :rtype: Config. - """ - - return Config( - PreTrainingDataModule, - **data_config, - tokenizer=tokenizer_config, - ) - - def _get_trainer(self, trainer_config: dict, strategy: Config) -> Config: - """ - Function that returns the trainer. - : dict trainer_config: trainer config. - : Config strategy: training strategy. - :return: trainer. - :rtype: Config. - """ - - return Config( - nl.Trainer, - **trainer_config, - strategy=strategy, - plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), - callbacks=[Config(TimingCallback)], - ) - - def _get_startegy(self, auto_config: dict) -> Config: - """ - Function that returns the training strategy. - : dict auto_config: model parallelism config. - :return: training strategy. - :rtype: Config. - """ - - return Config( - nl.MegatronStrategy, - pipeline_dtype=torch.bfloat16, - tensor_model_parallel_size=auto_config.get('tensor_model_parallel_size', 1), - pipeline_model_parallel_size=auto_config.get('pipeline_model_parallel_size', 1), - virtual_pipeline_model_parallel_size=auto_config.get('virtual_pipeline_model_parallel_size', None), - context_parallel_size=auto_config.get('context_parallel_size', 1), - expert_model_parallel_size=auto_config.get('expert_model_parallel_size', 1), - ) - - def _get_logger(self, run_name: str, path_to_logs: str) -> Config: - """ - Function that returns the training strategy. - : str run_name: name of run. - : str path_to_logs: path to logs directory. - :return: training logger. - :rtype: Config. 
- """ - - tb_logger = Config(TensorBoardLogger, save_dir=path_to_logs) - - ckpt = Config( - nl.ModelCheckpoint, - monitor="reduced_train_loss", - save_best_model=False, - save_last=False, - save_top_k=0, - ) + self.model_type = model_type + self.model_size_in_b = self._get_model_size(model.__class__.__name__) + self.gpu_count = gpu_count + self.num_gpus = gpus_per_node - return Config( - nl.NeMoLogger, - ckpt=ckpt, - name=run_name, - tensorboard=tb_logger, - wandb=None, - dir=path_to_logs, - ) def _get_message(self, config: dict) -> str: """ @@ -271,7 +141,8 @@ def _get_message(self, config: dict) -> str: message = "AutoConfigurator runner config:\n" for key, value in config.items(): - message += f"{key}: {value}\n" + if key != "self": + message += f"{key}: {value}\n" return message @@ -290,3 +161,58 @@ def _get_model_type(self, model: str) -> str: return "nemotron" else: return None + + def _get_model_size(self, config_string): + match = re.search(r'(\d+)([BM])', config_string) + if match: + size = int(match.group(1)) + measure = match.group(2) + if measure == 'B': + return size + elif measure == 'M': + return size / 1000 # Convert millions to billions + return None + + # def generate_configs(self) -> dict: + # """ + # Function that returns a dictionary of Partial configs. + # : dict config: runner config. + # : str tokenizer_type: tokenizer type. + # : str tokenizer_path: path to the tokenizer. + # : str path_to_logs: path to logs directory. + # :return: dictionary of Partial configs. + # :rtype: dict. + # """ + + # # Generate base config for the given model size + # base_cfg, train_cfg = generic_base_config( + # model=self.config["model"], + # model_name=self.config["model_type"], + # model_size_in_b=self.config["model_size"], + # cfg=self.config, + # ) + + # # Launch grid search for training constraints + # configs = generate_grid_search_configs(base_cfg, train_cfg) + + # tokenizer_type = self.config.get("tokenizer_type") + # tokenizer_path = self.config.get("tokenizer_path") + # path_to_logs = self.config.get("path_to_logs") + + # tokenizer = self._get_tokenizer(tokenizer_type, tokenizer_path) + # for name, config in configs.items(): + # strategy = self._get_startegy(config['auto_config']) + # configs[name] = Partial( + # pretrain, + # model=self._get_model(config['model'], tokenizer), + # trainer=self._get_trainer(config['trainer'], strategy), + # data=self._get_data(config['data'], tokenizer), + # optim=self._get_optim(config['optim']), + # log=self._get_logger(name, path_to_logs), + # resume=None, + # ) + + # return configs + + # def _get_model(self, model_config, tokenizer): + # return GPTModel(model_config, tokenizer=tokenizer) \ No newline at end of file From c641b7d8dff49072b51857380f4a4bf51e6918bc Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Sep 2024 13:30:14 +0000 Subject: [PATCH 29/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../auto_configurator/base_configs/__init__.py | 13 +++++++------ .../tools/auto_configurator/base_configs/basic.py | 11 +++++------ .../llm/tools/auto_configurator/core/utils.py | 6 +++--- .../llm/tools/auto_configurator/runner.py | 11 +++++------ 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py index cf1714ff3b43..08eb69bb555c 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py +++ 
b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py @@ -13,9 +13,10 @@ # limitations under the License. from nemo.collections.llm.tools.auto_configurator.base_configs.basic import ModelConfig -#from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma -#from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT -#from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama -#from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral -#from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral -#from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron + +# from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma +# from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT +# from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama +# from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral +# from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral +# from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py index fc26dae49592..1e3016eeb5d9 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py @@ -12,26 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch from dataclasses import dataclass, field +import torch from megatron.core.optimizer import OptimizerConfig - from pytorch_lightning.loggers import TensorBoardLogger +from nemo import lightning as nl from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer -from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule from nemo.collections.llm import GPTModel, PreTrainingDataModule from nemo.collections.llm.utils import Config +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule from nemo.utils.exp_manager import TimingCallback -from nemo import lightning as nl @dataclass class ModelConfig: def __init__( self, - config = None, + config=None, ): """ Args: @@ -167,7 +166,7 @@ def get_data(self) -> dict: **data_config, tokenizer=tokenizer, ) - + def get_logger(self) -> Config: """ Function that returns the training strategy. diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index 5d2a5e775b96..dc592653a310 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -342,7 +342,7 @@ def _calculate_model_size( def generic_base_config( - config = None, + config=None, ) -> dict: """Generates a base config dictionary from a base config python file. 
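Patch 28 drops the explicit model_size and model_measure arguments from the runner and instead parses the size out of the model config class name (the _get_model_size helper shown above). A standalone illustration follows; the class-name strings are hypothetical example inputs, not names taken from the patch:

import re

def get_model_size(config_string):
    """Return the model size in billions parsed from a name such as '...8B' or '...843M'."""
    match = re.search(r'(\d+)([BM])', config_string)
    if match:
        size = int(match.group(1))
        if match.group(2) == 'B':
            return size
        return size / 1000  # convert millions to billions
    return None

print(get_model_size("Llama3Config8B"))   # 8
print(get_model_size("GPTConfig843M"))    # 0.843
print(get_model_size("MixtralConfig"))    # None (size not encoded in the name)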
@@ -372,8 +372,8 @@ def generic_base_config( model = model_cls(cfg=cfg) else: model = base_configs.ModelConfig(config) - #import pdb - #pdb.set_trace() + # import pdb + # pdb.set_trace() base_cfg = { "model": model.get_model(), "optim": model.get_optim(), diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index ae3ff031ac1c..d80d095f0b90 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Optional -from dataclasses import dataclass -import torch import re +from dataclasses import dataclass +from typing import List, Optional +import torch from pytorch_lightning.loggers import TensorBoardLogger from nemo import lightning as nl @@ -130,7 +130,6 @@ def __init__( self.gpu_count = gpu_count self.num_gpus = gpus_per_node - def _get_message(self, config: dict) -> str: """ Function that returns runner config line by line. @@ -161,7 +160,7 @@ def _get_model_type(self, model: str) -> str: return "nemotron" else: return None - + def _get_model_size(self, config_string): match = re.search(r'(\d+)([BM])', config_string) if match: @@ -215,4 +214,4 @@ def _get_model_size(self, config_string): # return configs # def _get_model(self, model_config, tokenizer): - # return GPTModel(model_config, tokenizer=tokenizer) \ No newline at end of file + # return GPTModel(model_config, tokenizer=tokenizer) From 71b04202fd18a4431d74f0dbe3035392b4ef2252 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Sep 2024 06:31:57 -0700 Subject: [PATCH 30/63] rename main config Signed-off-by: dimapihtar --- .../tools/auto_configurator/base_configs/__init__.py | 10 ++++++++++ .../llm/tools/auto_configurator/base_configs/basic.py | 2 +- .../llm/tools/auto_configurator/core/utils.py | 6 +++--- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py index 08eb69bb555c..755b87a67918 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+<<<<<<< HEAD from nemo.collections.llm.tools.auto_configurator.base_configs.basic import ModelConfig # from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma @@ -20,3 +21,12 @@ # from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral # from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral # from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron +======= +from nemo.collections.llm.tools.auto_configurator.base_configs.basic import TrainConfig +#from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma +#from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT +#from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama +#from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral +#from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral +#from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron +>>>>>>> 661d435ca (rename main config) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py index 1e3016eeb5d9..137c0fe2020b 100644 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py +++ b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py @@ -27,7 +27,7 @@ @dataclass -class ModelConfig: +class TrainConfig: def __init__( self, config=None, diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index dc592653a310..bd1a97f6ad7a 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -371,9 +371,9 @@ def generic_base_config( if default_model: model = model_cls(cfg=cfg) else: - model = base_configs.ModelConfig(config) - # import pdb - # pdb.set_trace() + model = base_configs.TrainConfig(config) + #import pdb + #pdb.set_trace() base_cfg = { "model": model.get_model(), "optim": model.get_optim(), From 410300957914b301c3984b8ddc99ae2e40ffd429 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Sep 2024 08:13:16 -0700 Subject: [PATCH 31/63] remove base configs for models Signed-off-by: dimapihtar --- .../base_configs/__init__.py | 32 --- .../auto_configurator/base_configs/basic.py | 211 ----------------- .../auto_configurator/base_configs/gemma.py | 48 ---- .../auto_configurator/base_configs/gpt.py | 45 ---- .../auto_configurator/base_configs/llama.py | 48 ---- .../auto_configurator/base_configs/mistral.py | 46 ---- .../auto_configurator/base_configs/mixtral.py | 46 ---- .../base_configs/nemotron.py | 48 ---- .../auto_configurator/core/base_config.py | 214 +++++++++++++++++- .../llm/tools/auto_configurator/core/utils.py | 92 ++++---- 10 files changed, 256 insertions(+), 574 deletions(-) delete mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py delete mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/basic.py delete mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py delete mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py delete mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/llama.py delete mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py delete mode 100644 
nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py delete mode 100644 nemo/collections/llm/tools/auto_configurator/base_configs/nemotron.py diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py b/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py deleted file mode 100644 index 755b87a67918..000000000000 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -<<<<<<< HEAD -from nemo.collections.llm.tools.auto_configurator.base_configs.basic import ModelConfig - -# from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma -# from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT -# from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama -# from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral -# from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral -# from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron -======= -from nemo.collections.llm.tools.auto_configurator.base_configs.basic import TrainConfig -#from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma -#from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT -#from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama -#from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral -#from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral -#from nemo.collections.llm.tools.auto_configurator.base_configs.nemotron import NeMotron ->>>>>>> 661d435ca (rename main config) diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py b/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py deleted file mode 100644 index 137c0fe2020b..000000000000 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/basic.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from dataclasses import dataclass, field - -import torch -from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import TensorBoardLogger - -from nemo import lightning as nl -from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer -from nemo.collections.llm import GPTModel, PreTrainingDataModule -from nemo.collections.llm.utils import Config -from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule -from nemo.utils.exp_manager import TimingCallback - - -@dataclass -class TrainConfig: - def __init__( - self, - config=None, - ): - """ - Args: - name (str): model name. - version (int): model version. - size (int): model size. - measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. - cfg (dict): auto configurator runner config. - """ - - self.config = config - - def get_model(self): - """Function that returns model config.""" - - self.config.model.global_batch_size = self.config.global_batch_size - self.config.model.seq_length = self.config.seq_length - - return self.config.model - - def get_optim(self) -> OptimizerConfig: - """Function that returns optimizer config. - - Returns: - OptimizerConfig: optimizer config. - """ - optim_params = { - "optimizer": "adam", - "lr": 1e-4, - "min_lr": 1e-5, - "use_distributed_optimizer": True, - "bf16": True, - "adam_beta1": 0.9, - "adam_beta2": 0.95, - "overlap_grad_reduce": False, - "overlap_param_gather": True, - } - - optim_config = Config( - OptimizerConfig, - **optim_params, - ) - - sched = Config( - CosineAnnealingScheduler, - warmup_steps=10, - constant_steps=0, - min_lr=optim_config.min_lr, - ) - - return Config( - MegatronOptimizerModule, - config=optim_config, - lr_scheduler=sched, - ) - - def get_trainer(self) -> dict: - """Function that returns config for PTL trainer. - - Returns: - Config: trainer config. - """ - - trainer_config = { - "accelerator": "gpu", - "enable_checkpointing": False, - "use_distributed_sampler": False, - "max_epochs": None, - "log_every_n_steps": 1, - "limit_val_batches": 1, - "limit_test_batches": 1, - "accumulate_grad_batches": 1, - "gradient_clip_val": 1.0, - "num_nodes": self.config.num_nodes, - "devices": self.config.num_gpus, - "max_steps": self.config.max_steps_per_run, - "val_check_interval": self.config.max_steps_per_run, - } - - strategy = Config( - nl.MegatronStrategy, - pipeline_dtype=torch.bfloat16, - ) - - return Config( - nl.Trainer, - **trainer_config, - strategy=strategy, - plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), - callbacks=[Config(TimingCallback)], - ) - - return trainer_config - - def get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: - """ - Function that returns the tokenizer config. - : str tokenizer_type: tokenizer type. - : str tokenizer_path: path to the tokenizer. - :return: tokenizer config. - :rtype: Config. - """ - - if tokenizer_type == "sentencepiece": - return Config(SentencePieceTokenizer, model_path=tokenizer_path) - else: - return Config(AutoTokenizer, pretrained_model_name=tokenizer_path) - - def get_data(self) -> dict: - """Function that returns dataset config. - - Returns: - dict: data config. 
- """ - - # Data config - data_config = { - "paths": self.config.data_paths, - "seq_length": self.config.seq_length, - "global_batch_size": self.config.global_batch_size, - "num_workers": 2, - "index_mapping_dir": None, - } - - # Define the tokenizer - tokenizer = self.get_tokenizer( - self.config.tokenizer_type, - self.config.tokenizer_path, - ) - - return Config( - PreTrainingDataModule, - **data_config, - tokenizer=tokenizer, - ) - - def get_logger(self) -> Config: - """ - Function that returns the training strategy. - : str run_name: name of run. - : str path_to_logs: path to logs directory. - :return: training logger. - :rtype: Config. - """ - - # Define TensorBoard Logger - tb_logger = Config(TensorBoardLogger, save_dir=self.config.path_to_logs) - - ckpt = Config( - nl.ModelCheckpoint, - monitor="reduced_train_loss", - save_best_model=False, - save_last=False, - save_top_k=0, - ) - - return Config( - nl.NeMoLogger, - ckpt=ckpt, - tensorboard=tb_logger, - wandb=None, - dir=self.config.path_to_logs, - ) - - def get_run_config(self) -> dict: - """Function that returns config for cluster job. - - Returns: - dict: cluster job config. - """ - - run_config = { - "name": self.config.model.__class__.__name__, - "results_dir": None, - "time_limit": f"0-00:{self.config.max_minutes_per_run}:00", - } - - return run_config diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py b/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py deleted file mode 100644 index 29ef5d0dcad9..000000000000 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from nemo.collections import llm -from nemo.collections.llm.utils import Config - -from .basic import Basic - - -class Gemma(Basic): - def __init__( - self, - model: Config = None, - cfg: dict = {}, - ): - """ - Args: - model (Config): model config. - cfg (dict): auto configurator runner config. - """ - - super().__init__(model=model, cfg=cfg) - - def get_model_config(self) -> Config: - """Function that returns model config. - - Returns: - Config: model config. - """ - - self.model.global_batch_size = self.global_batch_size - self.model.seq_length = self.seq_length - self.model.pipeline_dtype = torch.bfloat16 - - return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py b/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py deleted file mode 100644 index 8942486f082c..000000000000 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections import llm -from nemo.collections.llm.utils import Config - -from .basic import Basic - - -class GPT(Basic): - def __init__( - self, - model: Config = None, - cfg: dict = {}, - ): - """ - Args: - name (str): model name. - cfg (dict): auto configurator runner config. - """ - - super().__init__(model=model, cfg=cfg) - - def get_model_config(self) -> Config: - """Function that returns model config. - - Returns: - Config: model config. - """ - - self.model.global_batch_size = self.global_batch_size - self.model.seq_length = self.seq_length - - return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py b/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py deleted file mode 100644 index 9170a641ddc9..000000000000 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/llama.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from nemo.collections import llm -from nemo.collections.llm.utils import Config - -from .basic import Basic - - -class Llama(Basic): - def __init__( - self, - model: Config = None, - cfg: dict = {}, - ): - """ - Args: - model (Config): model config. - cfg (dict): auto configurator runner config. - """ - - super().__init__(model=model, cfg=cfg) - - def get_model_config(self) -> Config: - """Function that returns model config. - - Returns: - Config: model config. - """ - - self.model.global_batch_size = self.global_batch_size - self.model.seq_length = self.seq_length - self.model.pipeline_dtype = torch.bfloat16 - - return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py b/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py deleted file mode 100644 index 7abcabfbf4ab..000000000000 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/mistral.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from nemo.collections import llm -from nemo.collections.llm.utils import Config - -from .basic import Basic - - -class Mistral(Basic): - def __init__( - self, - model: Config = None, - cfg: dict = {}, - ): - """ - Args: - model (Config): model config. - cfg (dict): auto configurator runner config. - """ - - super().__init__(model=model, cfg=cfg) - - def get_model_config(self) -> Config: - """Function that returns model config. - - Returns: - Config: model config. - """ - - self.model.global_batch_size = self.global_batch_size - self.model.seq_length = self.seq_length - self.model.pipeline_dtype = torch.bfloat16 - - return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py b/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py deleted file mode 100644 index 0ea57c62b0c4..000000000000 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/mixtral.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from nemo.collections import llm -from nemo.collections.llm.utils import Config - -from .basic import Basic - - -class Mixtral(Basic): - def __init__( - self, - model: Config = None, - cfg: dict = {}, - ): - """ - Args: - model (Config): model config. - cfg (dict): auto configurator runner config. - """ - - super().__init__(model=model, cfg=cfg) - - def get_model_config(self) -> Config: - """Function that returns model config. - - Returns: - Config: model config. - """ - - self.model.global_batch_size = self.global_batch_size - self.model.seq_length = self.seq_length - self.model.pipeline_dtype = torch.bfloat16 - - return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/base_configs/nemotron.py b/nemo/collections/llm/tools/auto_configurator/base_configs/nemotron.py deleted file mode 100644 index 766503f09b8c..000000000000 --- a/nemo/collections/llm/tools/auto_configurator/base_configs/nemotron.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch - -from nemo.collections import llm -from nemo.collections.llm.utils import Config - -from .basic import Basic - - -class NeMotron(Basic): - def __init__( - self, - model: Config = None, - cfg: dict = {}, - ): - """ - Args: - model (Config): model config. - cfg (dict): auto configurator runner config. 
- """ - - super().__init__(model=model, cfg=cfg) - - def get_model_config(self) -> Config: - """Function that returns model config. - - Returns: - Config: model config. - """ - - self.model.global_batch_size = self.global_batch_size - self.model.seq_length = self.seq_length - self.model.pipeline_dtype = torch.bfloat16 - - return self.model diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index a0c98f07caa7..d6a1b6d17d3a 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -19,6 +19,218 @@ from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass, field + +import torch +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import TensorBoardLogger + +from nemo import lightning as nl +from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer +from nemo.collections.llm import GPTModel, PreTrainingDataModule +from nemo.collections.llm.utils import Config +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule +from nemo.utils.exp_manager import TimingCallback + + +class BaseConfig: + def __init__( + self, + config=None, + ): + """ + Args: + name (str): model name. + version (int): model version. + size (int): model size. + measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. + cfg (dict): auto configurator runner config. + """ + + self.config = config + + def get_model(self): + """Function that returns model config.""" + + self.config.model.global_batch_size = self.config.global_batch_size + self.config.model.seq_length = self.config.seq_length + + return self.config.model + + def get_optim(self) -> OptimizerConfig: + """Function that returns optimizer config. + + Returns: + OptimizerConfig: optimizer config. + """ + optim_params = { + "optimizer": "adam", + "lr": 1e-4, + "min_lr": 1e-5, + "use_distributed_optimizer": True, + "bf16": True, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "overlap_grad_reduce": False, + "overlap_param_gather": True, + } + + optim_config = Config( + OptimizerConfig, + **optim_params, + ) + + sched = Config( + CosineAnnealingScheduler, + warmup_steps=10, + constant_steps=0, + min_lr=optim_config.min_lr, + ) + + return Config( + MegatronOptimizerModule, + config=optim_config, + lr_scheduler=sched, + ) + + def get_trainer(self) -> dict: + """Function that returns config for PTL trainer. + + Returns: + Config: trainer config. 
+ """ + + trainer_config = { + "accelerator": "gpu", + "enable_checkpointing": False, + "use_distributed_sampler": False, + "max_epochs": None, + "log_every_n_steps": 1, + "limit_val_batches": 1, + "limit_test_batches": 1, + "accumulate_grad_batches": 1, + "gradient_clip_val": 1.0, + "num_nodes": self.config.num_nodes, + "devices": self.config.num_gpus, + "max_steps": self.config.max_steps_per_run, + "val_check_interval": self.config.max_steps_per_run, + } + + strategy = Config( + nl.MegatronStrategy, + pipeline_dtype=torch.bfloat16, + ) + + return Config( + nl.Trainer, + **trainer_config, + strategy=strategy, + plugins=Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + callbacks=[Config(TimingCallback)], + ) + + return trainer_config + + def get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: + """ + Function that returns the tokenizer config. + : str tokenizer_type: tokenizer type. + : str tokenizer_path: path to the tokenizer. + :return: tokenizer config. + :rtype: Config. + """ + + if tokenizer_type == "sentencepiece": + return Config(SentencePieceTokenizer, model_path=tokenizer_path) + else: + return Config(AutoTokenizer, pretrained_model_name=tokenizer_path) + + def get_data(self) -> dict: + """Function that returns dataset config. + + Returns: + dict: data config. + """ + + # Data config + data_config = { + "paths": self.config.data_paths, + "seq_length": self.config.seq_length, + "global_batch_size": self.config.global_batch_size, + "num_workers": 2, + "index_mapping_dir": None, + } + + # Define the tokenizer + tokenizer = self.get_tokenizer( + self.config.tokenizer_type, + self.config.tokenizer_path, + ) + + return Config( + PreTrainingDataModule, + **data_config, + tokenizer=tokenizer, + ) + + def get_logger(self) -> Config: + """ + Function that returns the training strategy. + : str run_name: name of run. + : str path_to_logs: path to logs directory. + :return: training logger. + :rtype: Config. + """ + + # Define TensorBoard Logger + tb_logger = Config(TensorBoardLogger, save_dir=self.config.path_to_logs) + + ckpt = Config( + nl.ModelCheckpoint, + monitor="reduced_train_loss", + save_best_model=False, + save_last=False, + save_top_k=0, + ) + + return Config( + nl.NeMoLogger, + ckpt=ckpt, + tensorboard=tb_logger, + wandb=None, + dir=self.config.path_to_logs, + ) + + def get_run_config(self) -> dict: + """Function that returns config for cluster job. + + Returns: + dict: cluster job config. + """ + + run_config = { + "name": self.config.model.__class__.__name__, + "results_dir": None, + "time_limit": f"0-00:{self.config.max_minutes_per_run}:00", + } + + return run_config + + def calculate_model_size( gpu_count: int, max_training_days: float, @@ -159,4 +371,4 @@ def _estimate_training_time( print(f"Cannot divide by zero. 
This can happen if gpu_count or tflops_per_gpu are zero: {err}") except NotImplementedError as err: print(f"Training time estimation is only available for {valid_models}: {err}") - return None + return None \ No newline at end of file diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index bd1a97f6ad7a..efa61e732f64 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -16,7 +16,6 @@ from dataclasses import dataclass from typing import List, Optional, Tuple -from nemo.collections.llm.tools.auto_configurator import base_configs from nemo.collections.llm.utils import Config MODULES = { @@ -74,7 +73,6 @@ def init_params(self): model_name = self.model_name model_size_in_b = self.model_size_in_b if model_name in GPT_BASED_MODELS: - self.ffn = 4 * self.hs if model_size_in_b < 0.25: self.hs, self.att_h, self.lr = 768, 12, 6e-4 elif model_size_in_b < 0.5: @@ -194,14 +192,14 @@ def init_params(self): for attempt in range(0, 10): for layers in (2**p for p in range(1, 10)): out_size = _calculate_model_size( - vocab_size=vocab_size, - seq_length=seq_length, - hidden_size=hs, + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, num_layers=layers, - ffn_size=ffn, - kv_channels=kv, - att_heads=att_h, - model_name=model_name, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, ) if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: self.layers = layers @@ -212,14 +210,14 @@ def init_params(self): for attempt in range(0, 6): for layers in range(16, 201, 16): out_size = _calculate_model_size( - vocab_size=vocab_size, - seq_length=seq_length, - hidden_size=hs, + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, num_layers=layers, - ffn_size=ffn, - kv_channels=kv, - att_heads=att_h, - model_name=model_name, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, ) if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: self.layers = layers @@ -230,14 +228,14 @@ def init_params(self): for attempt in range(0, 6): for layers in range(2, 201, 2): out_size = _calculate_model_size( - vocab_size=vocab_size, - seq_length=seq_length, - hidden_size=hs, + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, num_layers=layers, - ffn_size=ffn, - kv_channels=kv, - att_heads=att_h, - model_name=model_name, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, ) if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: self.layers = layers @@ -248,14 +246,14 @@ def init_params(self): for attempt in range(0, 6): for layers in range(5, 201, 5): out_size = _calculate_model_size( - vocab_size=vocab_size, - seq_length=seq_length, - hidden_size=hs, + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, num_layers=layers, - ffn_size=ffn, - kv_channels=kv, - att_heads=att_h, - model_name=model_name, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, ) if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: self.layers = layers @@ -266,14 +264,14 @@ def init_params(self): for attempt in 
range(0, 10): for layers in range(1, 200): out_size = _calculate_model_size( - vocab_size=vocab_size, - seq_length=seq_length, - hidden_size=hs, + vocab_size=self.vocab_size, + seq_length=self.seq_length, + hidden_size=self.hs, num_layers=layers, - ffn_size=ffn, - kv_channels=kv, - att_heads=att_h, - model_name=model_name, + ffn_size=self.ffn, + kv_channels=self.kv, + att_heads=self.att_h, + model_name=self.model_name, ) if model_size_in_b * (1.0 - margin) < out_size < model_size_in_b * (1.0 + margin) and not self.layers: self.layers = layers @@ -355,7 +353,7 @@ def generic_base_config( dict: dictionary containing the base configuration for the model. """ - from nemo.collections.llm.tools.auto_configurator.core.base_config import calculate_model_size + from nemo.collections.llm.tools.auto_configurator.core.base_config import BaseConfig, calculate_model_size default_model = False if config.model_size_in_b else True @@ -367,13 +365,8 @@ def generic_base_config( config.num_tokens_in_b, config.model_type, ) + model = BaseConfig(config) - if default_model: - model = model_cls(cfg=cfg) - else: - model = base_configs.TrainConfig(config) - #import pdb - #pdb.set_trace() base_cfg = { "model": model.get_model(), "optim": model.get_optim(), @@ -385,12 +378,13 @@ def generic_base_config( if default_model: params = ModelSizeParams( model_size_in_b, - cfg.get("vocab_size"), - cfg.get("seq_length"), - model_name, - ).init_params() + config.vocab_size, + config.seq_length, + config.model_type, + ) + params.init_params() - if model_name in GPT_BASED_MODELS: + if config.model_type in GPT_BASED_MODELS: base_cfg["model"].num_layers = params.layers base_cfg["model"].hidden_size = params.hs base_cfg["model"].num_attention_heads = params.att_h From e3793ade8cc878d494de4796a857819906392b0e Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Sep 2024 15:14:32 +0000 Subject: [PATCH 32/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../auto_configurator/core/base_config.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index d6a1b6d17d3a..1ff392b42c7a 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -14,10 +14,20 @@ import math import os +from dataclasses import dataclass, field from typing import Tuple -from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config +import torch +from megatron.core.optimizer import OptimizerConfig +from pytorch_lightning.loggers import TensorBoardLogger +from nemo import lightning as nl +from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer +from nemo.collections.llm import GPTModel, PreTrainingDataModule +from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config +from nemo.collections.llm.utils import Config +from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule +from nemo.utils.exp_manager import TimingCallback # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # @@ -33,18 +43,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from dataclasses import dataclass, field -import torch -from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import TensorBoardLogger -from nemo import lightning as nl -from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer -from nemo.collections.llm import GPTModel, PreTrainingDataModule -from nemo.collections.llm.utils import Config -from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule -from nemo.utils.exp_manager import TimingCallback class BaseConfig: @@ -371,4 +371,4 @@ def _estimate_training_time( print(f"Cannot divide by zero. This can happen if gpu_count or tflops_per_gpu are zero: {err}") except NotImplementedError as err: print(f"Training time estimation is only available for {valid_models}: {err}") - return None \ No newline at end of file + return None From 58155860536d43796ab67f62b148bdee2cc8c73e Mon Sep 17 00:00:00 2001 From: artbataev Date: Tue, 3 Sep 2024 15:15:17 +0000 Subject: [PATCH 33/63] Apply isort and black reformatting Signed-off-by: artbataev --- .../llm/tools/auto_configurator/core/base_config.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index 1ff392b42c7a..76bd0d08e4ce 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -44,9 +44,6 @@ # limitations under the License. - - - class BaseConfig: def __init__( self, From 4d03be0f54a837b97d4e3ac40691b212d347816b Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Sep 2024 09:45:05 -0700 Subject: [PATCH 34/63] change auto conf functionality Signed-off-by: dimapihtar --- .../auto_configurator/core/base_config.py | 9 +- .../auto_configurator/core/training_config.py | 28 +++--- .../llm/tools/auto_configurator/core/utils.py | 71 +++++++-------- .../llm/tools/auto_configurator/runner.py | 91 ++++++++++--------- 4 files changed, 102 insertions(+), 97 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index 76bd0d08e4ce..d7f8b5b052c6 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -60,6 +60,14 @@ def __init__( self.config = config + self.model = self.get_model() + self.optim = self.get_optim() + self.trainer = self.get_trainer() + self.data = self.get_data() + self.log = self.get_logger() + self.run = self.get_run_config() + self.tokenizer = self.get_tokenizer(config.tokenizer_type, config.tokenizer_path) + def get_model(self): """Function that returns model config.""" @@ -227,7 +235,6 @@ def get_run_config(self) -> dict: return run_config - def calculate_model_size( gpu_count: int, max_training_days: float, diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 06a94ce18e7b..cc1ff98dd711 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -55,15 +55,15 @@ def generate_grid_search_configs( # 2 * num_layers is needed because of encoder/decoder architecture. 
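The constructor change above means BaseConfig now builds every sub-config once and exposes them as plain attributes. A minimal sketch of how the rest of the series consumes this is below; all argument values and paths are placeholders rather than values taken from this patch:

from nemo.collections.llm import GPTConfig126M
from nemo.collections.llm.tools.auto_configurator import AutoConfigurator
from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config

# Placeholder runner arguments; a real run would use project-specific paths and sizes.
runner = AutoConfigurator(
    model=GPTConfig126M(),
    num_nodes=1,
    gpus_per_node=8,
    global_batch_size=16,
    seq_length=512,
    tensor_parallel_sizes=[1],
    pipeline_parallel_sizes=[1],
    micro_batch_sizes=[1, 2, 4],
    max_training_days=1,
    max_steps_per_run=25,
    num_tokens_in_b=10,
    vocab_size=51200,
    tokenizer_path="/path/to/tokenizer",
    data_paths=["/path/to/dataset_text_document"],
    path_to_logs="/path/to/logs",
)

# generic_base_config builds a BaseConfig from the runner and returns it
# together with the (possibly updated) runner config.
base_cfg, _ = generic_base_config(runner)

base_cfg.model      # model config with seq_length applied
base_cfg.optim      # MegatronOptimizerModule config
base_cfg.trainer    # nl.Trainer config with a MegatronStrategy
base_cfg.data       # PreTrainingDataModule config
base_cfg.tokenizer  # tokenizer config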
multiplier = 1 if model_name in GPT_BASED_MODELS else 2 - seq_length = base_cfg["model"].seq_length + seq_length = base_cfg.model.seq_length num_layers = ( - base_cfg["model"].num_layers if model_name in GPT_BASED_MODELS else base_cfg["model"].encoder.num_layers + base_cfg.model.num_layers if model_name in GPT_BASED_MODELS else base_cfg.model.encoder.num_layers ) if model_name in GPT_BASED_MODELS: - act_method = base_cfg["model"].activations_checkpoint_method + act_method = base_cfg.model.activations_checkpoint_method else: - act_method = base_cfg["model"].encoder.activations_checkpoint_method + act_method = base_cfg.model.encoder.activations_checkpoint_method params = _calculate_tp_pp_mbs_grid( model_size_in_b=model_size_in_b, @@ -83,14 +83,14 @@ def generate_grid_search_configs( for cp in params.cp: for ep in params.ep: for mbs in params.mbs: - num_gpus = base_cfg["trainer"].num_nodes * base_cfg["trainer"].devices - base_cfg["model"].global_batch_size = params.gbs + num_gpus = base_cfg.trainer.num_nodes * base_cfg.trainer.devices + base_cfg.model.global_batch_size = params.gbs if model_name in GPT_BASED_MODELS: - att_heads = base_cfg["model"].num_attention_heads - num_layers = base_cfg["model"].num_layers + att_heads = base_cfg.model.num_attention_heads + num_layers = base_cfg.model.num_layers else: - att_heads = base_cfg["model"].encoder.num_attention_heads - num_layers = base_cfg["model"].encoder.num_layers + att_heads = base_cfg.model.encoder.num_attention_heads + num_layers = base_cfg.model.encoder.num_layers model_parallelism = (tp * pp * cp * ep) if (cp and ep) else (tp * pp) mod_gbs = params.gbs % (mbs * num_gpus / model_parallelism) mod_att_heads = att_heads % tp @@ -108,7 +108,7 @@ def generate_grid_search_configs( valid_tp_pp_list.append((tp, pp, cp, ep)) # Generate grid search configs. - configs, base_cfg["auto_config"] = {}, {} + configs = {} for tp, pp, cp, ep in valid_tp_pp_list: ( virtual_pipelines, @@ -158,10 +158,12 @@ def generate_grid_search_configs( else: new_cfg = utils.modify_cfg(**kwargs) if new_cfg: # Save candidate cfg. - configs[new_cfg["run"]["name"]] = new_cfg + config_name = new_cfg["run"]["name"] + new_cfg.pop("run") + configs[config_name] = new_cfg print(f"\nAll candidate configurations created correctly. 
Total number of configs: {len(configs)}.\n") - return configs + return base_cfg, configs def _set_activations_checkpoint_params( diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index efa61e732f64..b1679ea93015 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -365,15 +365,7 @@ def generic_base_config( config.num_tokens_in_b, config.model_type, ) - model = BaseConfig(config) - - base_cfg = { - "model": model.get_model(), - "optim": model.get_optim(), - "trainer": model.get_trainer(), - "data": model.get_data(), - "run": model.get_run_config(), - } + base_cfg = BaseConfig(config) if default_model: params = ModelSizeParams( @@ -385,14 +377,14 @@ def generic_base_config( params.init_params() if config.model_type in GPT_BASED_MODELS: - base_cfg["model"].num_layers = params.layers - base_cfg["model"].hidden_size = params.hs - base_cfg["model"].num_attention_heads = params.att_h - base_cfg["model"].kv_channels = params.kv + base_cfg.model.num_layers = params.layers + base_cfg.model.hidden_size = params.hs + base_cfg.model.num_attention_heads = params.att_h + base_cfg.model.kv_channels = params.kv if not params.ffn: - base_cfg["model"].ffn_hidden_size = params.hs * 4 + base_cfg.model.ffn_hidden_size = params.hs * 4 else: - base_cfg["model"].ffn_hidden_size = params.ffn + base_cfg.model.ffn_hidden_size = params.ffn config.model_size_in_b = model_size_in_b @@ -439,60 +431,59 @@ def modify_cfg( new_cfg = copy.deepcopy(base_cfg) if model_name in GPT_BASED_MODELS: - att_heads = new_cfg["model"].num_attention_heads - num_layers = new_cfg["model"].num_layers + att_heads = new_cfg.model.num_attention_heads + num_layers = new_cfg.model.num_layers else: - att_heads = new_cfg["model"].encoder.num_attention_heads - num_layers = new_cfg["model"].encoder.num_layers + att_heads = new_cfg.model.encoder.num_attention_heads + num_layers = new_cfg.model.encoder.num_layers # gbs = mbs * num_gpus * accumulate_grad_batches / (tp * pp) - num_gpus = new_cfg["trainer"].num_nodes * new_cfg["trainer"].devices - gbs = new_cfg["model"].global_batch_size - seq_len = new_cfg["model"].seq_length + num_gpus = new_cfg.trainer.num_nodes * new_cfg.trainer.devices + gbs = new_cfg.model.global_batch_size + seq_len = new_cfg.model.seq_length - new_cfg = dict(auto_config={}, run=new_cfg["run"]) + new_cfg = dict(run=new_cfg.run) if act is not None: if model_name in GPT_BASED_MODELS: - new_cfg["auto_config"]["activations_checkpoint_num_layers"] = act + new_cfg["activations_checkpoint_num_layers"] = act else: - new_cfg["auto_config"]["encoder"]["activations_checkpoint_num_layers"] = act // 2 - new_cfg["auto_config"]["decoder"]["activations_checkpoint_num_layers"] = act // 2 + new_cfg["encoder"]["activations_checkpoint_num_layers"] = act // 2 + new_cfg["decoder"]["activations_checkpoint_num_layers"] = act // 2 if num_mbs_act is not None and model_name in GPT_BASED_MODELS: - new_cfg["auto_config"]["num_micro_batches_with_partial_activation_checkpoints"] = num_mbs_act + new_cfg["num_micro_batches_with_partial_activation_checkpoints"] = num_mbs_act if act_per_pipe is not None and model_name in GPT_BASED_MODELS: - new_cfg["auto_config"]["activations_checkpoint_layers_per_pipeline"] = act_per_pipe + new_cfg["activations_checkpoint_layers_per_pipeline"] = act_per_pipe if virtual_pipelines is not None and model_name in GPT_BASED_MODELS: - 
new_cfg["auto_config"]["virtual_pipeline_model_parallel_size"] = virtual_pipelines + new_cfg["virtual_pipeline_model_parallel_size"] = virtual_pipelines - new_cfg["auto_config"]["tensor_model_parallel_size"] = tp - new_cfg["auto_config"]["pipeline_model_parallel_size"] = pp - new_cfg["auto_config"]["micro_batch_size"] = mbs - new_cfg["auto_config"]["global_batch_size"] = gbs + new_cfg["tensor_model_parallel_size"] = tp + new_cfg["pipeline_model_parallel_size"] = pp + new_cfg["micro_batch_size"] = mbs + new_cfg["global_batch_size"] = gbs if cp is not None: - new_cfg["auto_config"]["context_parallel_size"] = cp + new_cfg["context_parallel_size"] = cp if ep is not None: - new_cfg["auto_config"]["expert_model_parallel_size"] = ep + new_cfg["expert_model_parallel_size"] = ep mod_gbs = gbs % (mbs * num_gpus / (tp * pp)) mod_att_heads = att_heads % tp mod_layers = num_layers % pp if mod_gbs == 0 and mod_att_heads == 0 and mod_layers == 0: # Valid config - days = max_minutes // 3600 - hours = (max_minutes % 3600) // 60 - mins = (max_minutes % 3600) % 60 - new_cfg["run"]["time_limit"] = f"{days}-{hours}:{mins}:00" + #days = max_minutes // 3600 + #hours = (max_minutes % 3600) // 60 + #mins = (max_minutes % 3600) % 60 + #new_cfg["run"]["time_limit"] = f"{days}-{hours}:{mins}:00" new_cfg["run"][ "name" ] = f"{new_cfg['run']['name']}_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" print( f"Valid config: SeqLen={seq_len}, GBS={gbs}, MBS={mbs}, TP={tp}, PP={pp}, CP={cp}, EP={ep}, act_ckpt_layers={act}, num_mbs_act={num_mbs_act}, act_per_pipe={act_per_pipe}. Adding to directory." ) - print(new_cfg) return new_cfg return None diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index d80d095f0b90..b02b6dc36e96 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -13,6 +13,7 @@ # limitations under the License. import re +import copy from dataclasses import dataclass from typing import List, Optional @@ -172,46 +173,50 @@ def _get_model_size(self, config_string): return size / 1000 # Convert millions to billions return None - # def generate_configs(self) -> dict: - # """ - # Function that returns a dictionary of Partial configs. - # : dict config: runner config. - # : str tokenizer_type: tokenizer type. - # : str tokenizer_path: path to the tokenizer. - # : str path_to_logs: path to logs directory. - # :return: dictionary of Partial configs. - # :rtype: dict. 
- # """ - - # # Generate base config for the given model size - # base_cfg, train_cfg = generic_base_config( - # model=self.config["model"], - # model_name=self.config["model_type"], - # model_size_in_b=self.config["model_size"], - # cfg=self.config, - # ) - - # # Launch grid search for training constraints - # configs = generate_grid_search_configs(base_cfg, train_cfg) - - # tokenizer_type = self.config.get("tokenizer_type") - # tokenizer_path = self.config.get("tokenizer_path") - # path_to_logs = self.config.get("path_to_logs") - - # tokenizer = self._get_tokenizer(tokenizer_type, tokenizer_path) - # for name, config in configs.items(): - # strategy = self._get_startegy(config['auto_config']) - # configs[name] = Partial( - # pretrain, - # model=self._get_model(config['model'], tokenizer), - # trainer=self._get_trainer(config['trainer'], strategy), - # data=self._get_data(config['data'], tokenizer), - # optim=self._get_optim(config['optim']), - # log=self._get_logger(name, path_to_logs), - # resume=None, - # ) - - # return configs - - # def _get_model(self, model_config, tokenizer): - # return GPTModel(model_config, tokenizer=tokenizer) +def generate_configs(config: AutoConfigurator = None) -> dict: + """ + Function that returns a dictionary of Partial configs. + : dict config: runner config. + : str tokenizer_type: tokenizer type. + : str tokenizer_path: path to the tokenizer. + : str path_to_logs: path to logs directory. + :return: dictionary of Partial configs. + :rtype: dict. + """ + + # Generate base config for the given model size + base_cfg, train_cfg = generic_base_config(config) + + # Launch grid search for training constraints + base_config, train_configs = generate_grid_search_configs(base_cfg, train_cfg) + + tokenizer = base_config.tokenizer + model = GPTModel(base_config.model, tokenizer=tokenizer) + + configs = {} + for name, config in train_configs.items(): + trainer = copy.deepcopy(base_config.trainer) + data = copy.deepcopy(base_config.data) + + # Set data params + data.micro_batch_size = config.get("micro_batch_size") + data.global_batch_size = config.get("global_batch_size") + + # Set strategy params + trainer.strategy.tensor_model_parallel_size = config.get("tensor_model_parallel_size") + trainer.strategy.pipeline_model_parallel_size = config.get("pipeline_model_parallel_size") + trainer.strategy.context_parallel_size = config.get("pipeline_model_parallel_size") + trainer.strategy.expert_model_parallel_size = config.get("expert_model_parallel_size") + trainer.strategy.virtual_pipeline_model_parallel_size = config.get("virtual_pipeline_model_parallel_size", None) + + configs[name] = Partial( + pretrain, + model=model, + trainer=trainer, + data=data, + optim=base_config.optim, + log=base_config.log, + resume=None, + ) + + return configs \ No newline at end of file From f812e2b999ffc4c8e933ba3002b7f8c7acbc8379 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Tue, 3 Sep 2024 16:46:16 +0000 Subject: [PATCH 35/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/core/base_config.py | 1 + .../tools/auto_configurator/core/training_config.py | 4 +--- .../llm/tools/auto_configurator/core/utils.py | 8 ++++---- .../collections/llm/tools/auto_configurator/runner.py | 11 +++++++---- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index d7f8b5b052c6..7314fa2c4857 100644 --- 
a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -235,6 +235,7 @@ def get_run_config(self) -> dict: return run_config + def calculate_model_size( gpu_count: int, max_training_days: float, diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index cc1ff98dd711..7fc91f4aee34 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -56,9 +56,7 @@ def generate_grid_search_configs( multiplier = 1 if model_name in GPT_BASED_MODELS else 2 seq_length = base_cfg.model.seq_length - num_layers = ( - base_cfg.model.num_layers if model_name in GPT_BASED_MODELS else base_cfg.model.encoder.num_layers - ) + num_layers = base_cfg.model.num_layers if model_name in GPT_BASED_MODELS else base_cfg.model.encoder.num_layers if model_name in GPT_BASED_MODELS: act_method = base_cfg.model.activations_checkpoint_method diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index b1679ea93015..89bc268a56f9 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -475,10 +475,10 @@ def modify_cfg( mod_layers = num_layers % pp if mod_gbs == 0 and mod_att_heads == 0 and mod_layers == 0: # Valid config - #days = max_minutes // 3600 - #hours = (max_minutes % 3600) // 60 - #mins = (max_minutes % 3600) % 60 - #new_cfg["run"]["time_limit"] = f"{days}-{hours}:{mins}:00" + # days = max_minutes // 3600 + # hours = (max_minutes % 3600) // 60 + # mins = (max_minutes % 3600) % 60 + # new_cfg["run"]["time_limit"] = f"{days}-{hours}:{mins}:00" new_cfg["run"][ "name" ] = f"{new_cfg['run']['name']}_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index b02b6dc36e96..837d8a98ea35 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import re import copy +import re from dataclasses import dataclass from typing import List, Optional @@ -173,6 +173,7 @@ def _get_model_size(self, config_string): return size / 1000 # Convert millions to billions return None + def generate_configs(config: AutoConfigurator = None) -> dict: """ Function that returns a dictionary of Partial configs. 
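For reference, each entry of the train_configs dict that generate_configs iterates over is a flat dictionary assembled by modify_cfg. A hypothetical candidate could look like the following; the key names follow the code above, the values are illustrative only:

# Hypothetical candidate emitted by the grid search.
candidate = {
    "tensor_model_parallel_size": 2,
    "pipeline_model_parallel_size": 1,
    "context_parallel_size": 1,
    "expert_model_parallel_size": 1,
    "virtual_pipeline_model_parallel_size": None,
    "micro_batch_size": 4,
    "global_batch_size": 256,
}

# generate_configs copies these values onto each candidate's trainer strategy
# (tensor/pipeline/context/expert parallel sizes) and data module
# (micro and global batch size) before wrapping everything in a Partial.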
@@ -192,7 +193,7 @@ def generate_configs(config: AutoConfigurator = None) -> dict: tokenizer = base_config.tokenizer model = GPTModel(base_config.model, tokenizer=tokenizer) - + configs = {} for name, config in train_configs.items(): trainer = copy.deepcopy(base_config.trainer) @@ -207,7 +208,9 @@ def generate_configs(config: AutoConfigurator = None) -> dict: trainer.strategy.pipeline_model_parallel_size = config.get("pipeline_model_parallel_size") trainer.strategy.context_parallel_size = config.get("pipeline_model_parallel_size") trainer.strategy.expert_model_parallel_size = config.get("expert_model_parallel_size") - trainer.strategy.virtual_pipeline_model_parallel_size = config.get("virtual_pipeline_model_parallel_size", None) + trainer.strategy.virtual_pipeline_model_parallel_size = config.get( + "virtual_pipeline_model_parallel_size", None + ) configs[name] = Partial( pretrain, @@ -219,4 +222,4 @@ def generate_configs(config: AutoConfigurator = None) -> dict: resume=None, ) - return configs \ No newline at end of file + return configs From 97f9e6139a29529c63d85ed6374c3d1d33da02b5 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Sep 2024 04:48:38 -0700 Subject: [PATCH 36/63] fix docstring Signed-off-by: dimapihtar --- .../auto_configurator/core/base_config.py | 59 ++++++++--------- .../core/calculate_performance.py | 66 ++++++------------- .../auto_configurator/core/training_config.py | 11 ++-- .../llm/tools/auto_configurator/core/utils.py | 13 ++-- .../llm/tools/auto_configurator/runner.py | 44 ++++++++++--- 5 files changed, 93 insertions(+), 100 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index 7314fa2c4857..3ee0b73e949a 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -45,17 +45,10 @@ class BaseConfig: - def __init__( - self, - config=None, - ): + def __init__(self, config=None): """ Args: - name (str): model name. - version (int): model version. - size (int): model size. - measure (str): meausre of model size. "M" if model size in millions, "B" if in billions. - cfg (dict): auto configurator runner config. + config (AutoConfigurator): auto configurator runner config. """ self.config = config @@ -69,18 +62,22 @@ def __init__( self.tokenizer = self.get_tokenizer(config.tokenizer_type, config.tokenizer_path) def get_model(self): - """Function that returns model config.""" + """Function that returns model config. + + Returns: + Config: model config. + """ self.config.model.global_batch_size = self.config.global_batch_size self.config.model.seq_length = self.config.seq_length return self.config.model - def get_optim(self) -> OptimizerConfig: + def get_optim(self) -> Config[OptimizerConfig]: """Function that returns optimizer config. Returns: - OptimizerConfig: optimizer config. + Config[OptimizerConfig]: optimizer config. """ optim_params = { "optimizer": "adam", @@ -112,11 +109,11 @@ def get_optim(self) -> OptimizerConfig: lr_scheduler=sched, ) - def get_trainer(self) -> dict: + def get_trainer(self) -> Config[nl.Trainer]: """Function that returns config for PTL trainer. Returns: - Config: trainer config. + Config[nl.Trainer]: trainer config. """ trainer_config = { @@ -151,12 +148,14 @@ def get_trainer(self) -> dict: return trainer_config def get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: - """ - Function that returns the tokenizer config. 
- : str tokenizer_type: tokenizer type. - : str tokenizer_path: path to the tokenizer. - :return: tokenizer config. - :rtype: Config. + """Function that returns the tokenizer config. + + Args: + tokenizer_type (str): tokenizer type. + tokenizer_path (str): path to the tokenizer. + + Returns: + Config: tokenizer config. """ if tokenizer_type == "sentencepiece": @@ -164,11 +163,11 @@ def get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: else: return Config(AutoTokenizer, pretrained_model_name=tokenizer_path) - def get_data(self) -> dict: + def get_data(self) -> Config[PreTrainingDataModule]: """Function that returns dataset config. Returns: - dict: data config. + Config[PreTrainingDataModule]: data config. """ # Data config @@ -192,13 +191,11 @@ def get_data(self) -> dict: tokenizer=tokenizer, ) - def get_logger(self) -> Config: - """ - Function that returns the training strategy. - : str run_name: name of run. - : str path_to_logs: path to logs directory. - :return: training logger. - :rtype: Config. + def get_logger(self) -> Config[nl.NeMoLogger]: + """Function that returns the training strategy. + + Returns: + Config[nl.NeMoLogger]: NeMo Logger config. """ # Define TensorBoard Logger @@ -229,7 +226,6 @@ def get_run_config(self) -> dict: run_config = { "name": self.config.model.__class__.__name__, - "results_dir": None, "time_limit": f"0-00:{self.config.max_minutes_per_run}:00", } @@ -257,6 +253,7 @@ def calculate_model_size( model_size_in_b (float): number of parameters in the model, if known. tflops_per_gpu (int): estimated number of TFLOPS/s per GPU. num_tokens_in_b (int): number of tokens to train the model for. + model_name (str): name of the model. Returns: float: number of parameters to use for training. @@ -376,4 +373,4 @@ def _estimate_training_time( print(f"Cannot divide by zero. This can happen if gpu_count or tflops_per_gpu are zero: {err}") except NotImplementedError as err: print(f"Training time estimation is only available for {valid_models}: {err}") - return None + return None \ No newline at end of file diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py index a3aeabe3697e..449cd1df063c 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -24,58 +24,24 @@ def get_results( - training_logs: str = None, + config = None, path_to_save: str = None, - model_name: str = None, - num_nodes: int = None, - model_version: int = None, - seq_length: int = None, - global_batch_size: int = None, - vocab_size: int = None, - model_size: Optional[int] = None, - model_measure: Optional[str] = "B", - gpus_per_node: Optional[int] = 8, - max_training_days: Optional[int] = 2, - tflops_per_gpu: Optional[int] = 140, - num_tokens_in_b: Optional[int] = 300, - custom_model: Optional[bool] = False, output_top_n: Optional[int] = 10, ): - """Generates possible train configs. + """Generates performance results. Args: - training_logs (str): path to the dicrectory with training logs. + config (AutoConfigurator): auto configurator runner config. path_to_save (str): path where to save performance results. - model_name (str): model name used for auto conf search. - num_nodes (int): number of nodes used for auto conf search. - model_version (int): version of model. 3 for GPT3, 2 for Llama2. - seq_length (int): model sequence length. 
- global_batch_size (int): model global batch size. - vocab_size (int): size of tokenizer vocabulary. - model_size (Optional[int]): size of model used for auto conf search. - model_measure (Optional[str]): "M" if model_size is specified in millions. "B" if in billions. - gpus_per_node (Optional[int]): number of GPUs per node used for auto conf search. - max_training_days (Optional[int]): number of days expected model to be trained. - tflops_per_gpu (Optional[int]): estimated tflops per GPU. - num_tokens_in_b (Optional[int]): number of tokens in billions in train dataset. - custom_model (Optional[bool]): set to True if custom model was used. output_top_n (Optional[int]): Number of configs to be printed out as best configs. """ # Get model architecture - cfg = locals() - cfg["gpu_count"] = num_nodes * gpus_per_node - base_cfg, _ = generic_base_config( - model_name=model_name, - model_version=model_version, - model_size_in_b=model_size, - model_measure=model_measure, - cfg=cfg, - ) + base_cfg, _ = generic_base_config(config) - layers = base_cfg["model"].num_layers - hs = base_cfg["model"].hidden_size - ffn_hs = base_cfg["model"].ffn_hidden_size + layers = base_cfg.model.num_layers + hs = base_cfg.model.hidden_size + ffn_hs = base_cfg.model.ffn_hidden_size training_logs = training_logs final_result_logs = path_to_save @@ -315,11 +281,11 @@ def calculate_tflops( def find_error(error_file: str, errors: list = ["CUDA out of memory"]): - """Finds the error among job output. + """Function that finds the error among job output. Args: - :param list errors: list of "popular" errors. - :param str error_file: path to the job output. + errors (list): list of "popular" errors. + error_file (str): path to the job output. Returns: str: serror message if job has been failed because of one of listed errors or None if not. @@ -334,7 +300,15 @@ def find_error(error_file: str, errors: list = ["CUDA out of memory"]): return error -def get_config(run_name: str): +def get_config(run_name: str) -> tuple: + """Function that extract model parallelism parameters + + Args: + run_name (str): name of the run. + + Returns: + tuple: model parallelism parameters. + """ pattern = r'_(tp|pp|cp|ep|mbs|act_ckpt|num_mbs_act|act_per_pipe)_([^_]+)' # Find all matches in the input string @@ -356,4 +330,4 @@ def get_config(run_name: str): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 7fc91f4aee34..d2e76162d559 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -38,7 +38,7 @@ def generate_grid_search_configs( base_cfg: dict, train_cfg: dict, -) -> Tuple[str, List[int], int]: +) -> Tuple[dict, dict]: """Generates the grid of all possible configurations for the given model, and stores each different configuration in a yaml file. Args: @@ -46,6 +46,7 @@ def generate_grid_search_configs( train_cfg (dict): train configuration of the model to be trained. Returns: + dict: base config. dict: generated configs. """ @@ -228,12 +229,12 @@ def _set_activations_checkpoint_params( @dataclass class GPT3GridSearch: - """Selects grid search space for TP, PP, MBS parameters for GPT-3 and 80GB GPUs. + """Selects grid search space for TP, PP, CP, EP, MBS parameters for GPT-3 and 80GB GPUs. Args: model_size_in_b (float): number of parameters in the model. 
valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. - seq length (int): sequence length to use for training. + seq_length (int): sequence length to use for training. gpu_memory_gb (int): size of GPU memory in GB. """ @@ -547,7 +548,7 @@ class T5GridSearch: Args: model_size_in_b (float): number of parameters in the model. valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. - seq length (int): sequence length to use for training. + seq_length (int): sequence length to use for training. gpu_memory_gb (int): size of GPU memory in GB. """ @@ -687,7 +688,7 @@ class BertGridSearch: Args: model_size_in_b (float): number of parameters in the model. valid_pp (List[int]): list of valid Pipeline Parallelism (PP) values for this config. - seq length (int): sequence length to use for training. + seq_length (int): sequence length to use for training. gpu_memory_gb (int): size of GPU memory in GB. """ diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index 89bc268a56f9..f21420bda19c 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -47,8 +47,8 @@ class ModelSizeParams: Args: model_size_in_b (float): number of parameters in the desired model config, in billions. - seq_length (int): sequence length to be used during training. vocab_size (int): size of the vocabulary to use for training. + seq_length (int): sequence length to be used during training. model_name (str): name of the model to be trained, i.e. gpt3, t5, mt5... Raises: @@ -339,18 +339,15 @@ def _calculate_model_size( return model_size -def generic_base_config( - config=None, -) -> dict: +def generic_base_config(config) -> dict: """Generates a base config dictionary from a base config python file. Args: - model_name (str): name of the model, i.e. gpt3, t5, mt5... - model_size_in_b (int): model size. - cfg (dict): dict config object for the Auto Configurator tool. + config (AutoConfigurator): config object for the Auto Configurator tool. Returns: - dict: dictionary containing the base configuration for the model. + BaseConfig: base configuration for the model. + AutoConfigurator: config object for the Auto Configurator tool. """ from nemo.collections.llm.tools.auto_configurator.core.base_config import BaseConfig, calculate_model_size diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 837d8a98ea35..baed16c51e15 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -108,6 +108,7 @@ def __init__( for key, value in locals().items(): if key != 'self': setattr(self, key, value) + config = locals() config.pop('self') logging.info(self._get_message(config)) @@ -134,9 +135,12 @@ def __init__( def _get_message(self, config: dict) -> str: """ Function that returns runner config line by line. - : dict config: runner config. - :return: runner config params. - :rtype: str. + + Args: + config (dict): runner config. + + Returns: + str: runner config params. """ message = "AutoConfigurator runner config:\n" @@ -147,6 +151,16 @@ def _get_message(self, config: dict) -> str: return message def _get_model_type(self, model: str) -> str: + """ + Function that returns model type from model class name. + + Args: + models (str): model class name. + + Returns: + str: model type. 
+ """ + if "GPT" in model: return "gpt3" elif "Llama" in model: @@ -162,7 +176,17 @@ def _get_model_type(self, model: str) -> str: else: return None - def _get_model_size(self, config_string): + def _get_model_size(self, config_string) -> int: + """ + Function that returns model size from model class name. + + Args: + models (str): model class name. + + Returns: + int: model size. + """ + match = re.search(r'(\d+)([BM])', config_string) if match: size = int(match.group(1)) @@ -177,12 +201,12 @@ def _get_model_size(self, config_string): def generate_configs(config: AutoConfigurator = None) -> dict: """ Function that returns a dictionary of Partial configs. - : dict config: runner config. - : str tokenizer_type: tokenizer type. - : str tokenizer_path: path to the tokenizer. - : str path_to_logs: path to logs directory. - :return: dictionary of Partial configs. - :rtype: dict. + + Args: + config (AutoConfigurator): Auto Configurator object. + + Returns: + dict: dictionary of Partial configs. """ # Generate base config for the given model size From d2fed7a274f86758f2a7c5c1e54b3e13feab3dc4 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Sep 2024 11:49:27 +0000 Subject: [PATCH 37/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/core/base_config.py | 4 ++-- .../tools/auto_configurator/core/calculate_performance.py | 6 +++--- nemo/collections/llm/tools/auto_configurator/runner.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index 3ee0b73e949a..229a697d3521 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -63,7 +63,7 @@ def __init__(self, config=None): def get_model(self): """Function that returns model config. - + Returns: Config: model config. """ @@ -373,4 +373,4 @@ def _estimate_training_time( print(f"Cannot divide by zero. This can happen if gpu_count or tflops_per_gpu are zero: {err}") except NotImplementedError as err: print(f"Training time estimation is only available for {valid_models}: {err}") - return None \ No newline at end of file + return None diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py index 449cd1df063c..160342176c6d 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -24,7 +24,7 @@ def get_results( - config = None, + config=None, path_to_save: str = None, output_top_n: Optional[int] = 10, ): @@ -302,7 +302,7 @@ def find_error(error_file: str, errors: list = ["CUDA out of memory"]): def get_config(run_name: str) -> tuple: """Function that extract model parallelism parameters - + Args: run_name (str): name of the run. 
@@ -330,4 +330,4 @@ def get_config(run_name: str) -> tuple: if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index baed16c51e15..ef00c26bf2cc 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -204,7 +204,7 @@ def generate_configs(config: AutoConfigurator = None) -> dict: Args: config (AutoConfigurator): Auto Configurator object. - + Returns: dict: dictionary of Partial configs. """ From eb9bae503e06db1545643338e6c45e996fd9537c Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Sep 2024 05:05:01 -0700 Subject: [PATCH 38/63] remove unused imports Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/runner.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index ef00c26bf2cc..b7d502473fb4 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -14,22 +14,15 @@ import copy import re + from dataclasses import dataclass from typing import List, Optional -import torch -from pytorch_lightning.loggers import TensorBoardLogger - -from nemo import lightning as nl -from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer -from nemo.collections.llm import GPTModel, PreTrainingDataModule from nemo.collections.llm.api import pretrain from nemo.collections.llm.tools.auto_configurator.core.training_config import generate_grid_search_configs from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config from nemo.collections.llm.utils import Config, Partial -from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule from nemo.utils import logging -from nemo.utils.exp_manager import TimingCallback SUPPORTED_MODELS = [ "gpt3", @@ -164,7 +157,7 @@ def _get_model_type(self, model: str) -> str: if "GPT" in model: return "gpt3" elif "Llama" in model: - return "Llama" + return "llama" elif "Mixtral" in model: return "mixtral" elif "Mistral" in model: @@ -186,7 +179,6 @@ def _get_model_size(self, config_string) -> int: Returns: int: model size. """ - match = re.search(r'(\d+)([BM])', config_string) if match: size = int(match.group(1)) From 4606ef3851f9dba3b9eda47bbf9783f8c97640a4 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Sep 2024 05:22:19 -0700 Subject: [PATCH 39/63] add changes Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/core/utils.py | 25 ++++++------------- .../llm/tools/auto_configurator/runner.py | 1 + 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index f21420bda19c..6abe0715913e 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -18,14 +18,6 @@ from nemo.collections.llm.utils import Config -MODULES = { - "gpt3": "GPT", - "llama": "Llama", - "mixtral": "Mixtral", - "mistral": "Mistral", - "gemma": "Gemma", - "nemotron": "NeMotron", -} GPT_BASED_MODELS = [ "gpt3", @@ -426,20 +418,19 @@ def modify_cfg( dict: dictionary containing the updated model configuration parameters. 
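The get_config helper touched in the calculate_performance.py hunks above recovers parallelism settings from the run names assembled by modify_cfg. A quick sketch with a hypothetical run name:

import re

# Hypothetical run name in the format built by modify_cfg earlier in this series.
run_name = "GPTConfig126M_1nodes_tp_2_pp_1_cp_1_ep_1_mbs_4_act_ckpt_None_num_mbs_act_None_act_per_pipe_None"

pattern = r'_(tp|pp|cp|ep|mbs|act_ckpt|num_mbs_act|act_per_pipe)_([^_]+)'
print(dict(re.findall(pattern, run_name)))
# {'tp': '2', 'pp': '1', 'cp': '1', 'ep': '1', 'mbs': '4',
#  'act_ckpt': 'None', 'num_mbs_act': 'None', 'act_per_pipe': 'None'}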
""" - new_cfg = copy.deepcopy(base_cfg) if model_name in GPT_BASED_MODELS: - att_heads = new_cfg.model.num_attention_heads - num_layers = new_cfg.model.num_layers + att_heads = base_cfg.model.num_attention_heads + num_layers = base_cfg.model.num_layers else: - att_heads = new_cfg.model.encoder.num_attention_heads - num_layers = new_cfg.model.encoder.num_layers + att_heads = base_cfg.model.encoder.num_attention_heads + num_layers = base_cfg.model.encoder.num_layers # gbs = mbs * num_gpus * accumulate_grad_batches / (tp * pp) - num_gpus = new_cfg.trainer.num_nodes * new_cfg.trainer.devices - gbs = new_cfg.model.global_batch_size - seq_len = new_cfg.model.seq_length + num_gpus = base_cfg.trainer.num_nodes * base_cfg.trainer.devices + gbs = base_cfg.model.global_batch_size + seq_len = base_cfg.model.seq_length - new_cfg = dict(run=new_cfg.run) + new_cfg = dict(run=base_cfg.run) if act is not None: if model_name in GPT_BASED_MODELS: new_cfg["activations_checkpoint_num_layers"] = act diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index b7d502473fb4..57b82a4edbf4 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -18,6 +18,7 @@ from dataclasses import dataclass from typing import List, Optional +from nemo.collections.llm import GPTModel from nemo.collections.llm.api import pretrain from nemo.collections.llm.tools.auto_configurator.core.training_config import generate_grid_search_configs from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config From a4e81283acee14790d8596e3558589d739e484c4 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Sep 2024 05:28:12 -0700 Subject: [PATCH 40/63] remove activations_checkpoint_num_layers Signed-off-by: dimapihtar --- nemo/collections/llm/gpt/model/base.py | 1 - .../llm/tools/auto_configurator/core/training_config.py | 2 +- nemo/collections/llm/tools/auto_configurator/core/utils.py | 4 ---- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index 5d5ae94c076f..fde1837e91a4 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -115,7 +115,6 @@ class GPTConfig(TransformerConfig, io.IOMixin): masked_softmax_fusion: bool = True deallocate_pipeline_outputs = True global_batch_size: Optional[int] = 256 - activations_checkpoint_method: Optional[int] = None transformer_layer_spec: Union[ModuleSpec, Callable[["GPTConfig"], ModuleSpec]] = default_layer_spec forward_step_fn: Callable = gpt_forward_step diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index d2e76162d559..5f3bd7f0e253 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -60,7 +60,7 @@ def generate_grid_search_configs( num_layers = base_cfg.model.num_layers if model_name in GPT_BASED_MODELS else base_cfg.model.encoder.num_layers if model_name in GPT_BASED_MODELS: - act_method = base_cfg.model.activations_checkpoint_method + act_method = None else: act_method = base_cfg.model.encoder.activations_checkpoint_method diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index 6abe0715913e..b2b61d0931bc 
100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -463,10 +463,6 @@ def modify_cfg( mod_layers = num_layers % pp if mod_gbs == 0 and mod_att_heads == 0 and mod_layers == 0: # Valid config - # days = max_minutes // 3600 - # hours = (max_minutes % 3600) // 60 - # mins = (max_minutes % 3600) % 60 - # new_cfg["run"]["time_limit"] = f"{days}-{hours}:{mins}:00" new_cfg["run"][ "name" ] = f"{new_cfg['run']['name']}_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" From b853a83d7d3a0439ab1cdb9451cb575cf9fdc3f5 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Sep 2024 07:57:42 -0700 Subject: [PATCH 41/63] remove gbs from config Signed-off-by: dimapihtar --- nemo/collections/llm/gpt/model/base.py | 1 - .../collections/llm/tools/auto_configurator/core/base_config.py | 1 - .../llm/tools/auto_configurator/core/training_config.py | 2 +- nemo/collections/llm/tools/auto_configurator/core/utils.py | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/nemo/collections/llm/gpt/model/base.py b/nemo/collections/llm/gpt/model/base.py index fde1837e91a4..9ffc77923c7b 100644 --- a/nemo/collections/llm/gpt/model/base.py +++ b/nemo/collections/llm/gpt/model/base.py @@ -114,7 +114,6 @@ class GPTConfig(TransformerConfig, io.IOMixin): attention_softmax_in_fp32: bool = False masked_softmax_fusion: bool = True deallocate_pipeline_outputs = True - global_batch_size: Optional[int] = 256 transformer_layer_spec: Union[ModuleSpec, Callable[["GPTConfig"], ModuleSpec]] = default_layer_spec forward_step_fn: Callable = gpt_forward_step diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index 229a697d3521..78e376c7bf36 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -68,7 +68,6 @@ def get_model(self): Config: model config. 
""" - self.config.model.global_batch_size = self.config.global_batch_size self.config.model.seq_length = self.config.seq_length return self.config.model diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 5f3bd7f0e253..2d3ab169f5b5 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -83,7 +83,7 @@ def generate_grid_search_configs( for ep in params.ep: for mbs in params.mbs: num_gpus = base_cfg.trainer.num_nodes * base_cfg.trainer.devices - base_cfg.model.global_batch_size = params.gbs + base_cfg.data.global_batch_size = params.gbs if model_name in GPT_BASED_MODELS: att_heads = base_cfg.model.num_attention_heads num_layers = base_cfg.model.num_layers diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index b2b61d0931bc..cd3d9bdc7bac 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -427,7 +427,7 @@ def modify_cfg( # gbs = mbs * num_gpus * accumulate_grad_batches / (tp * pp) num_gpus = base_cfg.trainer.num_nodes * base_cfg.trainer.devices - gbs = base_cfg.model.global_batch_size + gbs = base_cfg.data.global_batch_size seq_len = base_cfg.model.seq_length new_cfg = dict(run=base_cfg.run) From 7040056936dbc799c5bb9b3d0453e0f656c99b1a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Sep 2024 08:32:04 -0700 Subject: [PATCH 42/63] fix logs Signed-off-by: dimapihtar --- examples/llm/auto_configurator/auto_config.py | 107 ++++-------------- .../llm/tools/auto_configurator/runner.py | 6 + 2 files changed, 25 insertions(+), 88 deletions(-) diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py index 071f193a6cd5..2ae3e8670508 100644 --- a/examples/llm/auto_configurator/auto_config.py +++ b/examples/llm/auto_configurator/auto_config.py @@ -15,42 +15,28 @@ import argparse import os import shutil +import fiddle as fdl -import torch -from pytorch_lightning.loggers import TensorBoardLogger - -from nemo import lightning as nl -from nemo.collections import llm -from nemo.collections.llm.api import train -from nemo.collections.llm.gpt.data import PreTrainingDataModule from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, get_results -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.lightning import NeMoLogger -from nemo.lightning.pytorch.callbacks import ModelCheckpoint -from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule -from nemo.utils.exp_manager import TimingCallback - +from nemo.collections.llm.tools.auto_configurator.runner import generate_configs +from nemo.collections.llm import GPTConfig126M def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--run_number", type=int, help="Number of config to run") - parser.add_argument("--logs_dir", type=str, help="Path where to save training logs") - parser.add_argument("--data_path", type=str, help="Path to the dataset") - parser.add_argument("--get_results", action="store_true") + #parser.add_argument("--run_number", type=int, help="Number of config to run") + #parser.add_argument("--logs_dir", type=str, help="Path where to save training logs") + #parser.add_argument("--data_path", type=str, help="Path to the dataset") 
+ #parser.add_argument("--get_results", action="store_true") return parser.parse_args() - def train_config(args): # GPT-3 126M # This example will generate 3 configs. # It is expected that this script will be run 3 times with changing --run_number flag for each run from 0 to 2. # After all configurations are trained, please trigger the script using --get_results flag. runner = AutoConfigurator( - model_type="gpt3", - model_version=3, - model_size=126, - model_measure="M", + model=GPTConfig126M(), num_nodes=1, gpus_per_node=1, gpu_memory_gb=40, @@ -63,80 +49,25 @@ def train_config(args): max_steps_per_run=25, num_tokens_in_b=10, vocab_size=51200, - data_paths=args.data_path, + tokenizer_path="/home/models/gpt2", + data_paths=["/home/data/test_text_document"], + path_to_logs="/home/scripts/test_autoconf", + #data_paths=args.data_path, ) # Get generated configs - configs = runner.generate_configs() - - tokenizer = get_nmt_tokenizer( - "megatron", - "GPT2BPETokenizer", - ) - - # Define candidate to run - runs = list(configs.keys()) - run_name = runs[args.run_number] - config = configs[run_name] - - # Define data - config["data"].pop('split') - data = PreTrainingDataModule(**config["data"], split="900,50,50", tokenizer=tokenizer) - - # Define model - config["model"].tensor_model_parallel_size = config["auto_config"].get("tensor_model_parallel_size") - config["model"].pipeline_model_parallel_size = config["auto_config"].get("pipeline_model_parallel_size") - config["model"].context_parallel_size = config["auto_config"].get("context_parallel_size") - config["model"].expert_model_parallel_size = config["auto_config"].get("expert_model_parallel_size") - model = llm.GPTModel(config["model"], tokenizer=data.tokenizer) - - # Define optimizer - opt = MegatronOptimizerModule(config=config["optim"]) - - # Define strategy - strategy = nl.MegatronStrategy( - pipeline_dtype=torch.bfloat16, - tensor_model_parallel_size=config["auto_config"].get("tensor_model_parallel_size"), - pipeline_model_parallel_size=config["auto_config"].get("pipeline_model_parallel_size"), - virtual_pipeline_model_parallel_size=config["auto_config"].get("virtual_pipeline_model_parallel_size", None), - context_parallel_size=config["auto_config"].get("context_parallel_size"), - expert_model_parallel_size=config["auto_config"].get("expert_model_parallel_size"), - ) - - # Define TensorBoard logger - tensorboard_logger = TensorBoardLogger( - save_dir=f"{args.logs_dir}/{run_name}", - ) - - # Define trainer - trainer = nl.Trainer( - **config["trainer"], - strategy=strategy, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), - callbacks=[TimingCallback()], - logger=tensorboard_logger, - ) - - # Define logger - nemo_logger = NeMoLogger( - dir=f"{args.logs_dir}/{run_name}", - ) - - # Train candidate - train( - model=model, - data=data, - trainer=trainer, - log=nemo_logger, - tokenizer="data", - optim=opt, - ) + configs = generate_configs(runner) + #for name, config in configs.items(): + #print(config) + cfgs = list(configs.values()) + pretrain = fdl.build(cfgs[0]) + pretrain() def main(): args = get_args() - if not args.get_results: + if True: train_config(args) else: diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 57b82a4edbf4..7253fec962ed 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -13,6 +13,7 @@ # limitations under the License. 
import copy +import os import re from dataclasses import dataclass @@ -215,6 +216,7 @@ def generate_configs(config: AutoConfigurator = None) -> dict: for name, config in train_configs.items(): trainer = copy.deepcopy(base_config.trainer) data = copy.deepcopy(base_config.data) + log = copy.deepcopy(base_config.log) # Set data params data.micro_batch_size = config.get("micro_batch_size") @@ -229,6 +231,10 @@ def generate_configs(config: AutoConfigurator = None) -> dict: "virtual_pipeline_model_parallel_size", None ) + # Set the directory where to save the logs + log.dir = os.path.join(log.dir, name) + log.tensorboard.save_dir = os.path.join(log.dir, name) + configs[name] = Partial( pretrain, model=model, From ae744ae945cd84c86f8c83a9fb8be5f523c47b38 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Sep 2024 15:32:55 +0000 Subject: [PATCH 43/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- examples/llm/auto_configurator/auto_config.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py index 2ae3e8670508..f09b3b4215c1 100644 --- a/examples/llm/auto_configurator/auto_config.py +++ b/examples/llm/auto_configurator/auto_config.py @@ -15,21 +15,24 @@ import argparse import os import shutil + import fiddle as fdl +from nemo.collections.llm import GPTConfig126M from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, get_results from nemo.collections.llm.tools.auto_configurator.runner import generate_configs -from nemo.collections.llm import GPTConfig126M + def get_args(): parser = argparse.ArgumentParser() - #parser.add_argument("--run_number", type=int, help="Number of config to run") - #parser.add_argument("--logs_dir", type=str, help="Path where to save training logs") - #parser.add_argument("--data_path", type=str, help="Path to the dataset") - #parser.add_argument("--get_results", action="store_true") + # parser.add_argument("--run_number", type=int, help="Number of config to run") + # parser.add_argument("--logs_dir", type=str, help="Path where to save training logs") + # parser.add_argument("--data_path", type=str, help="Path to the dataset") + # parser.add_argument("--get_results", action="store_true") return parser.parse_args() + def train_config(args): # GPT-3 126M # This example will generate 3 configs. 
@@ -52,18 +55,19 @@ def train_config(args): tokenizer_path="/home/models/gpt2", data_paths=["/home/data/test_text_document"], path_to_logs="/home/scripts/test_autoconf", - #data_paths=args.data_path, + # data_paths=args.data_path, ) # Get generated configs configs = generate_configs(runner) - #for name, config in configs.items(): - #print(config) + # for name, config in configs.items(): + # print(config) cfgs = list(configs.values()) pretrain = fdl.build(cfgs[0]) pretrain() + def main(): args = get_args() From eda32ce89ee8541be300db4b59ee488b02790947 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Wed, 4 Sep 2024 11:23:48 -0700 Subject: [PATCH 44/63] fix performance calculation Signed-off-by: dimapihtar --- examples/llm/auto_configurator/auto_config.py | 58 +++++++------------ .../core/calculate_performance.py | 58 +++++++++---------- .../llm/tools/auto_configurator/core/utils.py | 2 +- .../llm/tools/auto_configurator/runner.py | 12 ++-- 4 files changed, 54 insertions(+), 76 deletions(-) diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py index f09b3b4215c1..259e171b9471 100644 --- a/examples/llm/auto_configurator/auto_config.py +++ b/examples/llm/auto_configurator/auto_config.py @@ -25,10 +25,10 @@ def get_args(): parser = argparse.ArgumentParser() - # parser.add_argument("--run_number", type=int, help="Number of config to run") - # parser.add_argument("--logs_dir", type=str, help="Path where to save training logs") - # parser.add_argument("--data_path", type=str, help="Path to the dataset") - # parser.add_argument("--get_results", action="store_true") + parser.add_argument("--run_number", type=int, help="Number of config to run") + parser.add_argument("--logs_dir", type=str, help="Path where to save training logs") + parser.add_argument("--data_path", type=str, help="Path to the dataset") + parser.add_argument("--get_results", action="store_true") return parser.parse_args() @@ -53,27 +53,18 @@ def train_config(args): num_tokens_in_b=10, vocab_size=51200, tokenizer_path="/home/models/gpt2", - data_paths=["/home/data/test_text_document"], - path_to_logs="/home/scripts/test_autoconf", - # data_paths=args.data_path, + data_paths=args.data_path, + path_to_logs=args.logs_dir, ) - # Get generated configs - configs = generate_configs(runner) - - # for name, config in configs.items(): - # print(config) - cfgs = list(configs.values()) - pretrain = fdl.build(cfgs[0]) - pretrain() - - -def main(): - args = get_args() - - if True: - train_config(args) + base_cfg, configs = generate_configs(runner) + if not args.get_results: + # Get generated configs + configs = list(configs.values()) + # Run pre-training + pretrain = fdl.build(configs[args.run_number - 1]) + pretrain() else: # Get Auto Configurator results candidates = [d for d in os.listdir(args.logs_dir) if os.path.isdir(os.path.join(args.logs_dir, d))] @@ -87,24 +78,15 @@ def main(): os.rmdir(default_dir) - get_results( - training_logs=args.logs_dir, - path_to_save=args.logs_dir, - model_name="gpt3", - model_version=3, - model_size=126, - model_measure="M", - num_nodes=1, - gpus_per_node=1, - global_batch_size=16, - seq_length=512, - max_training_days=1, - num_tokens_in_b=10, - vocab_size=51200, - ) - + get_results(base_cfg, runner, args.logs_dir) print(f"The results were successfully saved to {args.logs_dir}.") +def main(): + args = get_args() + + train_config(args) + + if __name__ == '__main__': main() diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py 
b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py index 160342176c6d..020e1cbac36d 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -24,7 +24,8 @@ def get_results( - config=None, + base_config=None, + train_config=None, path_to_save: str = None, output_top_n: Optional[int] = 10, ): @@ -36,19 +37,15 @@ def get_results( output_top_n (Optional[int]): Number of configs to be printed out as best configs. """ - # Get model architecture - base_cfg, _ = generic_base_config(config) + layers = base_config.model.num_layers + hs = base_config.model.hidden_size + ffn_hs = base_config.model.ffn_hidden_size - layers = base_cfg.model.num_layers - hs = base_cfg.model.hidden_size - ffn_hs = base_cfg.model.ffn_hidden_size - - training_logs = training_logs + training_logs = path_to_save final_result_logs = path_to_save result_columns = [ - "Model Name", - "Model Size", + "Model Config", "Seq Length", "TP", "PP", @@ -97,7 +94,7 @@ def get_results( dirs.pop(0) for candidate_dir in dirs: - logs_dir = os.path.join(training_logs, candidate_dir) + logs_dir = os.path.join(training_logs, candidate_dir, "lightning_logs") logs_folder = [f.path for f in os.scandir(logs_dir) if f.is_dir()][0] tp, pp, cp, ep, mbs, act_ckpt, num_mbs_act, act_per_pipe = get_config(candidate_dir) @@ -141,26 +138,25 @@ def get_results( continue timing_list = [x.value for x in timing_list[5:]] avg_global_step_time = round(sum(timing_list) / len(timing_list), 4) - samples_per_s = round(global_batch_size / avg_global_step_time, 2) + samples_per_s = round(base_config.data.global_batch_size / avg_global_step_time, 2) m_tflops, m_tflops_gpu = calculate_tflops( - model_name=model_name, - gbs=global_batch_size, - enc_seq_len=seq_length, - dec_seq_len=seq_length, + model_name=train_config.model_type, + gbs=base_config.data.global_batch_size, + enc_seq_len=base_config.data.seq_length, + dec_seq_len=base_config.data.seq_length, hs=hs, ffn_hs=ffn_hs, layers=layers, - vocab=vocab_size, - nodes=num_nodes, - gpus_per_node=gpus_per_node, + vocab=train_config.vocab_size, + nodes=train_config.num_nodes, + gpus_per_node=train_config.gpus_per_node, time_per_step=avg_global_step_time, ) config_name = f"tp{tp}_pp{pp}_cp{cp}_ep{ep}_mbs{mbs}_act_{act_ckpt}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" result.append( [ - model_name, - model_size, - seq_length, + base_config.model.__class__.__name__, + base_config.data.seq_length, tp, pp, cp, @@ -172,9 +168,9 @@ def get_results( layers, hs, ffn_hs, - global_batch_size, - num_nodes, - gpus_per_node, + base_config.data.global_batch_size, + train_config.num_nodes, + train_config.gpus_per_node, avg_global_step_time, samples_per_s, m_tflops_gpu, @@ -184,25 +180,25 @@ def get_results( ) finally: continue - result.sort(key=lambda x: x[17]) + result.sort(key=lambda x: x[16]) print(f"Top {min(output_top_n, len(result))} configs sorted from fastest to slowest:") for i, res in enumerate(result): - print(f"Config #{i+1}: {res[-1]} with {res[17]:.4f}s per global step.") + print(f"Config #{i+1}: {res[-1]} with {res[16]:.4f}s per global step.") if i + 1 == output_top_n: break - top_config = f"{model_name}_{model_size}b_{num_nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_cp_{result[0][5]}_ep_{result[0][6]}_mbs_{result[0][7]}_act_ckpt_{result[0][8]}_num_mbs_act_{result[0][9]}_act_per_pipe_{result[0][10]}" + top_config = 
f"{train_config.model.__class__.__name__}_{train_config.num_nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_cp_{result[0][5]}_ep_{result[0][6]}_mbs_{result[0][7]}_act_ckpt_{result[0][8]}_num_mbs_act_{result[0][9]}_act_per_pipe_{result[0][10]}" print("\n==================================================") - print(f"Optimal config: {top_config} with {result[0][17]:.4f}s per global step.") + print(f"Optimal config: {top_config} with {result[0][16]:.4f}s per global step.") print("==================================================\n") # Save results as a CSV file. os.makedirs(final_result_logs, exist_ok=True) result_df = pd.DataFrame(result, columns=result_columns) - result_df.to_csv(os.path.join(final_result_logs, f"final_summary_{num_nodes}nodes.csv"), index=False) + result_df.to_csv(os.path.join(final_result_logs, f"final_summary_{train_config.num_nodes}nodes.csv"), index=False) error_df = pd.DataFrame(errors, columns=error_columns) - error_df.to_csv(os.path.join(final_result_logs, f"failed_jobs_{num_nodes}nodes.csv"), index=False) + error_df.to_csv(os.path.join(final_result_logs, f"failed_jobs_{train_config.num_nodes}nodes.csv"), index=False) def calculate_tflops( diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index cd3d9bdc7bac..d4d3d5b90179 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -465,7 +465,7 @@ def modify_cfg( # Valid config new_cfg["run"][ "name" - ] = f"{new_cfg['run']['name']}_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" + ] = f"{base_cfg.model.__class__.__name__}_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" print( f"Valid config: SeqLen={seq_len}, GBS={gbs}, MBS={mbs}, TP={tp}, PP={pp}, CP={cp}, EP={ep}, act_ckpt_layers={act}, num_mbs_act={num_mbs_act}, act_per_pipe={act_per_pipe}. Adding to directory." ) diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 7253fec962ed..2c0a488b34b0 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -192,7 +192,7 @@ def _get_model_size(self, config_string) -> int: return None -def generate_configs(config: AutoConfigurator = None) -> dict: +def generate_configs(runner_config: AutoConfigurator = None) -> dict: """ Function that returns a dictionary of Partial configs. 
@@ -204,7 +204,7 @@ def generate_configs(config: AutoConfigurator = None) -> dict: """ # Generate base config for the given model size - base_cfg, train_cfg = generic_base_config(config) + base_cfg, train_cfg = generic_base_config(runner_config) # Launch grid search for training constraints base_config, train_configs = generate_grid_search_configs(base_cfg, train_cfg) @@ -232,8 +232,8 @@ def generate_configs(config: AutoConfigurator = None) -> dict: ) # Set the directory where to save the logs - log.dir = os.path.join(log.dir, name) - log.tensorboard.save_dir = os.path.join(log.dir, name) + log.dir = os.path.join(runner_config.path_to_logs, name) + log.tensorboard.save_dir = log.dir configs[name] = Partial( pretrain, @@ -241,8 +241,8 @@ def generate_configs(config: AutoConfigurator = None) -> dict: trainer=trainer, data=data, optim=base_config.optim, - log=base_config.log, + log=log, resume=None, ) - return configs + return base_cfg, configs From ae46957147be0ce0dba7e5433bcf8f050be10f5e Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 5 Sep 2024 04:55:52 -0700 Subject: [PATCH 45/63] fix end-to-end example Signed-off-by: dimapihtar --- examples/llm/auto_configurator/auto_config.py | 15 +----- .../core/calculate_performance.py | 48 +++++++++++-------- 2 files changed, 29 insertions(+), 34 deletions(-) diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py index 259e171b9471..c1af38c59b27 100644 --- a/examples/llm/auto_configurator/auto_config.py +++ b/examples/llm/auto_configurator/auto_config.py @@ -52,7 +52,6 @@ def train_config(args): max_steps_per_run=25, num_tokens_in_b=10, vocab_size=51200, - tokenizer_path="/home/models/gpt2", data_paths=args.data_path, path_to_logs=args.logs_dir, ) @@ -66,25 +65,13 @@ def train_config(args): pretrain = fdl.build(configs[args.run_number - 1]) pretrain() else: - # Get Auto Configurator results - candidates = [d for d in os.listdir(args.logs_dir) if os.path.isdir(os.path.join(args.logs_dir, d))] - for subdir in candidates: - default_dir = os.path.join(args.logs_dir, subdir, "default") - if os.path.exists(default_dir) and os.path.isdir(default_dir): - for item in os.listdir(default_dir): - s = os.path.join(default_dir, item) - d = os.path.join(args.logs_dir, subdir, item) - shutil.move(s, d) - - os.rmdir(default_dir) - + # # Get Auto Configurator results get_results(base_cfg, runner, args.logs_dir) print(f"The results were successfully saved to {args.logs_dir}.") def main(): args = get_args() - train_config(args) diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py index 020e1cbac36d..180610034f3f 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -37,6 +37,16 @@ def get_results( output_top_n (Optional[int]): Number of configs to be printed out as best configs. 
""" + # Define needed variables + model_name = train_config.model_type + config_name = base_config.model.__class__.__name__ + global_batch_size = base_config.data.global_batch_size + seq_length=base_config.data.seq_length + + vocab_size = train_config.vocab_size + num_nodes = train_config.num_nodes + gpus_per_node = train_config.gpus_per_node + layers = base_config.model.num_layers hs = base_config.model.hidden_size ffn_hs = base_config.model.ffn_hidden_size @@ -68,8 +78,7 @@ def get_results( "Config Name", ] error_columns = [ - "Model Name", - "Model Size", + "Model Config", "Seq Length", "TP", "PP", @@ -105,8 +114,7 @@ def get_results( if error: errors.append( [ - model_name, - model_size, + config_name, seq_length, tp, pp, @@ -138,25 +146,25 @@ def get_results( continue timing_list = [x.value for x in timing_list[5:]] avg_global_step_time = round(sum(timing_list) / len(timing_list), 4) - samples_per_s = round(base_config.data.global_batch_size / avg_global_step_time, 2) + samples_per_s = round(global_batch_size / avg_global_step_time, 2) m_tflops, m_tflops_gpu = calculate_tflops( - model_name=train_config.model_type, - gbs=base_config.data.global_batch_size, - enc_seq_len=base_config.data.seq_length, - dec_seq_len=base_config.data.seq_length, + model_name=model_name, + gbs=global_batch_size, + enc_seq_len=seq_length, + dec_seq_len=seq_length, hs=hs, ffn_hs=ffn_hs, layers=layers, - vocab=train_config.vocab_size, - nodes=train_config.num_nodes, - gpus_per_node=train_config.gpus_per_node, + vocab=vocab_size, + nodes=num_nodes, + gpus_per_node=gpus_per_node, time_per_step=avg_global_step_time, ) config_name = f"tp{tp}_pp{pp}_cp{cp}_ep{ep}_mbs{mbs}_act_{act_ckpt}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" result.append( [ - base_config.model.__class__.__name__, - base_config.data.seq_length, + config_name, + seq_length, tp, pp, cp, @@ -168,9 +176,9 @@ def get_results( layers, hs, ffn_hs, - base_config.data.global_batch_size, - train_config.num_nodes, - train_config.gpus_per_node, + global_batch_size, + num_nodes, + gpus_per_node, avg_global_step_time, samples_per_s, m_tflops_gpu, @@ -187,7 +195,7 @@ def get_results( if i + 1 == output_top_n: break - top_config = f"{train_config.model.__class__.__name__}_{train_config.num_nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_cp_{result[0][5]}_ep_{result[0][6]}_mbs_{result[0][7]}_act_ckpt_{result[0][8]}_num_mbs_act_{result[0][9]}_act_per_pipe_{result[0][10]}" + top_config = f"{config_name}_{num_nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_cp_{result[0][5]}_ep_{result[0][6]}_mbs_{result[0][7]}_act_ckpt_{result[0][8]}_num_mbs_act_{result[0][9]}_act_per_pipe_{result[0][10]}" print("\n==================================================") print(f"Optimal config: {top_config} with {result[0][16]:.4f}s per global step.") print("==================================================\n") @@ -195,10 +203,10 @@ def get_results( # Save results as a CSV file. 
os.makedirs(final_result_logs, exist_ok=True) result_df = pd.DataFrame(result, columns=result_columns) - result_df.to_csv(os.path.join(final_result_logs, f"final_summary_{train_config.num_nodes}nodes.csv"), index=False) + result_df.to_csv(os.path.join(final_result_logs, f"final_summary_{num_nodes}nodes.csv"), index=False) error_df = pd.DataFrame(errors, columns=error_columns) - error_df.to_csv(os.path.join(final_result_logs, f"failed_jobs_{train_config.num_nodes}nodes.csv"), index=False) + error_df.to_csv(os.path.join(final_result_logs, f"failed_jobs_{num_nodes}nodes.csv"), index=False) def calculate_tflops( From 1fe46ed178dffec97ff2bd7735410494e59b72da Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 5 Sep 2024 11:56:45 +0000 Subject: [PATCH 46/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../llm/tools/auto_configurator/core/calculate_performance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py index 180610034f3f..0aafffea665c 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -41,7 +41,7 @@ def get_results( model_name = train_config.model_type config_name = base_config.model.__class__.__name__ global_batch_size = base_config.data.global_batch_size - seq_length=base_config.data.seq_length + seq_length = base_config.data.seq_length vocab_size = train_config.vocab_size num_nodes = train_config.num_nodes From 38082d9d50d5a4739a10347553ba700e8716c49e Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 5 Sep 2024 13:02:56 -0700 Subject: [PATCH 47/63] fix model config Signed-off-by: dimapihtar --- examples/llm/auto_configurator/auto_config.py | 6 ++--- .../llm/tools/auto_configurator/__init__.py | 2 +- .../auto_configurator/core/base_config.py | 9 ++++--- .../core/calculate_performance.py | 6 ++--- .../llm/tools/auto_configurator/runner.py | 27 +++++++++++-------- 5 files changed, 27 insertions(+), 23 deletions(-) diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py index c1af38c59b27..1deb30a1afe8 100644 --- a/examples/llm/auto_configurator/auto_config.py +++ b/examples/llm/auto_configurator/auto_config.py @@ -17,10 +17,10 @@ import shutil import fiddle as fdl +import nemo_run as run from nemo.collections.llm import GPTConfig126M -from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, get_results -from nemo.collections.llm.tools.auto_configurator.runner import generate_configs +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, get_results, generate_configs def get_args(): @@ -39,7 +39,7 @@ def train_config(args): # It is expected that this script will be run 3 times with changing --run_number flag for each run from 0 to 2. # After all configurations are trained, please trigger the script using --get_results flag. 
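(Editorial note: a minimal sketch of how this example is meant to be driven end to end, assuming the flag names shown in get_args() and that --run_number is 1-indexed, since the script selects configs[args.run_number - 1]; the paths below are placeholders, not part of the patch.)

import subprocess

LOGS_DIR = "/path/to/logs"      # placeholder
DATA_PATH = "/path/to/dataset"  # placeholder

# Train each of the generated candidate configs once.
for run_number in (1, 2, 3):
    subprocess.run(
        [
            "python", "examples/llm/auto_configurator/auto_config.py",
            "--run_number", str(run_number),
            "--logs_dir", LOGS_DIR,
            "--data_path", DATA_PATH,
        ],
        check=True,
    )

# Once all candidates have trained, collect the performance summary.
subprocess.run(
    [
        "python", "examples/llm/auto_configurator/auto_config.py",
        "--logs_dir", LOGS_DIR,
        "--data_path", DATA_PATH,
        "--get_results",
    ],
    check=True,
)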
runner = AutoConfigurator( - model=GPTConfig126M(), + model=run.Config(GPTConfig126M), num_nodes=1, gpus_per_node=1, gpu_memory_gb=40, diff --git a/nemo/collections/llm/tools/auto_configurator/__init__.py b/nemo/collections/llm/tools/auto_configurator/__init__.py index ac4d7e216725..5c6bde2c285a 100644 --- a/nemo/collections/llm/tools/auto_configurator/__init__.py +++ b/nemo/collections/llm/tools/auto_configurator/__init__.py @@ -1,2 +1,2 @@ from nemo.collections.llm.tools.auto_configurator.core.calculate_performance import get_results -from nemo.collections.llm.tools.auto_configurator.runner import AutoConfigurator +from nemo.collections.llm.tools.auto_configurator.runner import AutoConfigurator, generate_configs diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index 78e376c7bf36..00990333d132 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -86,8 +86,10 @@ def get_optim(self) -> Config[OptimizerConfig]: "bf16": True, "adam_beta1": 0.9, "adam_beta2": 0.95, - "overlap_grad_reduce": False, + "overlap_grad_reduce": True, "overlap_param_gather": True, + "clip_grad": 1.0, + "adam_eps": 1e-5, } optim_config = Config( @@ -124,7 +126,6 @@ def get_trainer(self) -> Config[nl.Trainer]: "limit_val_batches": 1, "limit_test_batches": 1, "accumulate_grad_batches": 1, - "gradient_clip_val": 1.0, "num_nodes": self.config.num_nodes, "devices": self.config.num_gpus, "max_steps": self.config.max_steps_per_run, @@ -198,7 +199,7 @@ def get_logger(self) -> Config[nl.NeMoLogger]: """ # Define TensorBoard Logger - tb_logger = Config(TensorBoardLogger, save_dir=self.config.path_to_logs) + tb_logger = Config(TensorBoardLogger, save_dir="tb_logs") ckpt = Config( nl.ModelCheckpoint, @@ -213,7 +214,7 @@ def get_logger(self) -> Config[nl.NeMoLogger]: ckpt=ckpt, tensorboard=tb_logger, wandb=None, - dir=self.config.path_to_logs, + dir="/nemo_run", ) def get_run_config(self) -> dict: diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py index 0aafffea665c..77091a7f37a2 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -98,12 +98,10 @@ def get_results( ] result = [] errors = [] - dirs = os.listdir(training_logs) - if ".sdk" in dirs: - dirs.pop(0) + dirs = [f.path for f in os.scandir(training_logs) if f.is_dir()] for candidate_dir in dirs: - logs_dir = os.path.join(training_logs, candidate_dir, "lightning_logs") + logs_dir = os.path.join(training_logs, candidate_dir, "tb_logs/lightning_logs") logs_folder = [f.path for f in os.scandir(logs_dir) if f.is_dir()][0] tp, pp, cp, ep, mbs, act_ckpt, num_mbs_act, act_per_pipe = get_config(candidate_dir) diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 2c0a488b34b0..08facfd07450 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -19,6 +19,8 @@ from dataclasses import dataclass from typing import List, Optional +import nemo_run as run + from nemo.collections.llm import GPTModel from nemo.collections.llm.api import pretrain from nemo.collections.llm.tools.auto_configurator.core.training_config import 
generate_grid_search_configs @@ -108,7 +110,7 @@ def __init__( config.pop('self') logging.info(self._get_message(config)) - model_type = self._get_model_type(model.__class__.__name__) + model_type = self._get_model_type(model) assert model_type in SUPPORTED_MODELS, f"model_type must be set to one of {SUPPORTED_MODELS}." assert tokenizer_type in SUPPORTED_TOKENIZERS, f"tokenizer_type must be set to one of {SUPPORTED_TOKENIZERS}." assert num_nodes, "num_nodes value must be specified." @@ -123,7 +125,7 @@ def __init__( assert max_minutes_per_run >= 10, "max_minutes_per_run must be an int and be at least 10 minutes." self.model_type = model_type - self.model_size_in_b = self._get_model_size(model.__class__.__name__) + self.model_size_in_b = self._get_model_size(model) self.gpu_count = gpu_count self.num_gpus = gpus_per_node @@ -145,17 +147,21 @@ def _get_message(self, config: dict) -> str: return message - def _get_model_type(self, model: str) -> str: + def _get_model_type(self, model: Config) -> str: """ Function that returns model type from model class name. Args: - models (str): model class name. + models (Config): model object. Returns: str: model type. """ + match = re.search(r"\w+\d+[MB]", str(model)) + if match: + model = match.group(0) + if "GPT" in model: return "gpt3" elif "Llama" in model: @@ -171,17 +177,17 @@ def _get_model_type(self, model: str) -> str: else: return None - def _get_model_size(self, config_string) -> int: + def _get_model_size(self, model: Config) -> int: """ Function that returns model size from model class name. Args: - models (str): model class name. + model (Config): model class name. Returns: int: model size. """ - match = re.search(r'(\d+)([BM])', config_string) + match = re.search(r'(\d+)([BM])', str(model)) if match: size = int(match.group(1)) measure = match.group(2) @@ -210,7 +216,7 @@ def generate_configs(runner_config: AutoConfigurator = None) -> dict: base_config, train_configs = generate_grid_search_configs(base_cfg, train_cfg) tokenizer = base_config.tokenizer - model = GPTModel(base_config.model, tokenizer=tokenizer) + model = Config(GPTModel, config=base_config.model, tokenizer=tokenizer) configs = {} for name, config in train_configs.items(): @@ -230,11 +236,10 @@ def generate_configs(runner_config: AutoConfigurator = None) -> dict: trainer.strategy.virtual_pipeline_model_parallel_size = config.get( "virtual_pipeline_model_parallel_size", None ) + if config.get("tensor_model_parallel_size") > 1: + trainer.strategy.sequence_parallel = True # Set the directory where to save the logs - log.dir = os.path.join(runner_config.path_to_logs, name) - log.tensorboard.save_dir = log.dir - configs[name] = Partial( pretrain, model=model, From 0ce1672eb7ce2f51944bc1bfb34ad9b2ef1bcd06 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 5 Sep 2024 20:03:54 +0000 Subject: [PATCH 48/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- examples/llm/auto_configurator/auto_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py index 1deb30a1afe8..0ef10ae7e727 100644 --- a/examples/llm/auto_configurator/auto_config.py +++ b/examples/llm/auto_configurator/auto_config.py @@ -20,7 +20,7 @@ import nemo_run as run from nemo.collections.llm import GPTConfig126M -from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, get_results, generate_configs +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, 
generate_configs, get_results def get_args(): From 41c9f2990a08f23f33820ddde60fc91d108dd761 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 5 Sep 2024 13:38:36 -0700 Subject: [PATCH 49/63] minor changes Signed-off-by: dimapihtar --- .../core/calculate_performance.py | 24 ++++++++++--------- .../auto_configurator/core/training_config.py | 1 + .../llm/tools/auto_configurator/core/utils.py | 3 ++- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py index 77091a7f37a2..43f1c6117929 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -39,7 +39,7 @@ def get_results( # Define needed variables model_name = train_config.model_type - config_name = base_config.model.__class__.__name__ + model_size = train_config.model_size_in_b global_batch_size = base_config.data.global_batch_size seq_length = base_config.data.seq_length @@ -55,7 +55,8 @@ def get_results( final_result_logs = path_to_save result_columns = [ - "Model Config", + "Model Name", + "Model Size", "Seq Length", "TP", "PP", @@ -75,10 +76,10 @@ def get_results( "Samples per Second", "Model TFLOPS / GPU", "Model TFLOPS Aggregate", - "Config Name", ] error_columns = [ - "Model Config", + "Model Name", + "Model Size", "Seq Length", "TP", "PP", @@ -112,7 +113,8 @@ def get_results( if error: errors.append( [ - config_name, + model_name, + model_size, seq_length, tp, pp, @@ -161,7 +163,8 @@ def get_results( config_name = f"tp{tp}_pp{pp}_cp{cp}_ep{ep}_mbs{mbs}_act_{act_ckpt}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" result.append( [ - config_name, + model_name, + model_size, seq_length, tp, pp, @@ -181,21 +184,20 @@ def get_results( samples_per_s, m_tflops_gpu, m_tflops, - config_name, ] ) finally: continue - result.sort(key=lambda x: x[16]) + result.sort(key=lambda x: x[17]) print(f"Top {min(output_top_n, len(result))} configs sorted from fastest to slowest:") for i, res in enumerate(result): - print(f"Config #{i+1}: {res[-1]} with {res[16]:.4f}s per global step.") + print(f"Config #{i+1}: {res[-1]} with {res[17]:.4f}s per global step.") if i + 1 == output_top_n: break - top_config = f"{config_name}_{num_nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_cp_{result[0][5]}_ep_{result[0][6]}_mbs_{result[0][7]}_act_ckpt_{result[0][8]}_num_mbs_act_{result[0][9]}_act_per_pipe_{result[0][10]}" + top_config = f"{model_name}_{model_size}b_{num_nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_cp_{result[0][5]}_ep_{result[0][6]}_mbs_{result[0][7]}_act_ckpt_{result[0][8]}_num_mbs_act_{result[0][9]}_act_per_pipe_{result[0][10]}" print("\n==================================================") - print(f"Optimal config: {top_config} with {result[0][16]:.4f}s per global step.") + print(f"Optimal config: {top_config} with {result[0][17]:.4f}s per global step.") print("==================================================\n") # Save results as a CSV file. 
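(Editorial note: the throughput figures computed in get_results() above come down to two divisions over the collected step timings, with the first five steps dropped as warm-up; a small illustration with made-up numbers follows — the model TFLOPS value itself is delegated to calculate_tflops().)

# Illustrative numbers only; get_results() reads the real step times from the TensorBoard event files.
step_times = [1.52, 1.49, 1.50, 1.51, 1.48]  # wall time per global step, after warm-up
global_batch_size = 2048

avg_global_step_time = round(sum(step_times) / len(step_times), 4)  # 1.5 s
samples_per_s = round(global_batch_size / avg_global_step_time, 2)  # ~1365.33 samples/s
print(avg_global_step_time, samples_per_s)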
diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 2d3ab169f5b5..1a7c629aa583 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -141,6 +141,7 @@ def generate_grid_search_configs( "max_steps": max_steps, "num_nodes": num_nodes, "model_name": model_name, + "model_size": model_size_in_b, } if act_ckpt_layers[0] is not None: if act_layers is not None and act_layers != "auto": diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index d4d3d5b90179..be9a6ed3e3a1 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -395,6 +395,7 @@ def modify_cfg( max_steps: int, num_nodes: int, model_name: str, + model_size, ) -> dict: """Modify the base configuration for the model with the new parameters that are specific to the current model, which the Auto Configurator tool heuristics selected. @@ -465,7 +466,7 @@ def modify_cfg( # Valid config new_cfg["run"][ "name" - ] = f"{base_cfg.model.__class__.__name__}_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" + ] = f"{model_name}_{str(model_size)}b_{num_nodes}nodes_tp_{tp}_pp_{pp}_cp_{cp}_ep_{ep}_mbs_{mbs}_act_ckpt_{act}_num_mbs_act_{num_mbs_act}_act_per_pipe_{act_per_pipe}" print( f"Valid config: SeqLen={seq_len}, GBS={gbs}, MBS={mbs}, TP={tp}, PP={pp}, CP={cp}, EP={ep}, act_ckpt_layers={act}, num_mbs_act={num_mbs_act}, act_per_pipe={act_per_pipe}. Adding to directory." 
) From 3fdcc83fb1fb6d86af2b8d243f2560358c0b4882 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 5 Sep 2024 15:35:14 -0700 Subject: [PATCH 50/63] minor changes Signed-off-by: dimapihtar --- examples/llm/auto_configurator/auto_config.py | 8 ++++++-- .../llm/tools/auto_configurator/core/base_config.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py index 0ef10ae7e727..abd13185d184 100644 --- a/examples/llm/auto_configurator/auto_config.py +++ b/examples/llm/auto_configurator/auto_config.py @@ -52,6 +52,7 @@ def train_config(args): max_steps_per_run=25, num_tokens_in_b=10, vocab_size=51200, + tokenizer_path="/home/models/gpt2", data_paths=args.data_path, path_to_logs=args.logs_dir, ) @@ -59,10 +60,13 @@ def train_config(args): base_cfg, configs = generate_configs(runner) if not args.get_results: # Get generated configs - configs = list(configs.values()) + partials = list(configs.values()) + names = list(configs.keys()) # Run pre-training - pretrain = fdl.build(configs[args.run_number - 1]) + partial = partials[args.run_number - 1] + partial.log.dir=os.path.join(args.logs_dir, names[args.run_number - 1]) + pretrain = fdl.build(partial) pretrain() else: # # Get Auto Configurator results diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index 00990333d132..549676f3a86f 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -214,7 +214,7 @@ def get_logger(self) -> Config[nl.NeMoLogger]: ckpt=ckpt, tensorboard=tb_logger, wandb=None, - dir="/nemo_run", + dir=self.config.path_to_logs, ) def get_run_config(self) -> dict: From 3fbcc16aba77e7367ea547a0dd65ac82558a903c Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Thu, 5 Sep 2024 22:36:05 +0000 Subject: [PATCH 51/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- examples/llm/auto_configurator/auto_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py index abd13185d184..0dc07513e38a 100644 --- a/examples/llm/auto_configurator/auto_config.py +++ b/examples/llm/auto_configurator/auto_config.py @@ -65,7 +65,7 @@ def train_config(args): # Run pre-training partial = partials[args.run_number - 1] - partial.log.dir=os.path.join(args.logs_dir, names[args.run_number - 1]) + partial.log.dir = os.path.join(args.logs_dir, names[args.run_number - 1]) pretrain = fdl.build(partial) pretrain() else: From 010e0de22591634b61b61ea0e6a088d4b18d641d Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 05:28:07 -0700 Subject: [PATCH 52/63] fix unit tests Signed-off-by: dimapihtar --- .../auto_configurator/core/base_config.py | 2 - .../llm/tools/auto_configurator/runner.py | 13 +- .../llm/auto_conf/test_base_configs.py | 552 +++++++++++------- .../llm/auto_conf/test_generate_configs.py | 467 ++++++--------- tests/collections/llm/auto_conf/test_utils.py | 100 ++-- 5 files changed, 533 insertions(+), 601 deletions(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index 549676f3a86f..ce117dae49be 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ 
b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -145,8 +145,6 @@ def get_trainer(self) -> Config[nl.Trainer]: callbacks=[Config(TimingCallback)], ) - return trainer_config - def get_tokenizer(self, tokenizer_type: str, tokenizer_path: str) -> Config: """Function that returns the tokenizer config. diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 08facfd07450..278f860acbea 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -52,7 +52,7 @@ def __init__( model: Config = None, num_nodes: int = None, data_paths: List = None, - path_to_logs: Optional[str] = None, + path_to_logs: str = None, tokenizer_type: Optional[str] = "autotokenizer", tokenizer_path: Optional[str] = "GPT2BPETokenizer", gpus_per_node: Optional[int] = 8, @@ -102,12 +102,10 @@ def __init__( """ # Print out the config - for key, value in locals().items(): - if key != 'self': - setattr(self, key, value) - config = locals() config.pop('self') + for key, value in config.items(): + setattr(self, key, value) logging.info(self._get_message(config)) model_type = self._get_model_type(model) @@ -142,8 +140,7 @@ def _get_message(self, config: dict) -> str: message = "AutoConfigurator runner config:\n" for key, value in config.items(): - if key != "self": - message += f"{key}: {value}\n" + message += f"{key}: {value}\n" return message @@ -231,7 +228,7 @@ def generate_configs(runner_config: AutoConfigurator = None) -> dict: # Set strategy params trainer.strategy.tensor_model_parallel_size = config.get("tensor_model_parallel_size") trainer.strategy.pipeline_model_parallel_size = config.get("pipeline_model_parallel_size") - trainer.strategy.context_parallel_size = config.get("pipeline_model_parallel_size") + trainer.strategy.context_parallel_size = config.get("context_parallel_size") trainer.strategy.expert_model_parallel_size = config.get("expert_model_parallel_size") trainer.strategy.virtual_pipeline_model_parallel_size = config.get( "virtual_pipeline_model_parallel_size", None diff --git a/tests/collections/llm/auto_conf/test_base_configs.py b/tests/collections/llm/auto_conf/test_base_configs.py index 847e33cd8ba5..2a93fdd89496 100644 --- a/tests/collections/llm/auto_conf/test_base_configs.py +++ b/tests/collections/llm/auto_conf/test_base_configs.py @@ -1,264 +1,364 @@ import re +import nemo_run as run +import torch from megatron.core.optimizer import OptimizerConfig -from nemo.collections.llm.tools.auto_configurator import base_configs -from nemo.collections.llm.utils import Config - - -def get_class_name(config_cls): - match = re.search(r' run.Config: + return run.Config(AutoTokenizer, pretrained_model_name="GPT2BPETokenizer") + +def get_data(seq_length, global_batch_size) -> run.Config[PreTrainingDataModule]: + config = { + "paths": "/", + "seq_length": seq_length, + "global_batch_size": global_batch_size, + "num_workers": 2, + "index_mapping_dir": None, + } + + return run.Config( + PreTrainingDataModule, + **config, + tokenizer=get_tokenizer(), + ) + +def get_trainer(num_nodes) -> run.Config[nl.Trainer]: + trainer_config = { + "accelerator": "gpu", + "enable_checkpointing": False, + "use_distributed_sampler": False, + "max_epochs": None, + "log_every_n_steps": 1, + "limit_val_batches": 1, + "limit_test_batches": 1, + "accumulate_grad_batches": 1, + "num_nodes": num_nodes, + "devices": 8, + "max_steps": 50, + "val_check_interval": 50, + } + + strategy = 
run.Config( + nl.MegatronStrategy, + pipeline_dtype=torch.bfloat16, + ) + + return run.Config( + nl.Trainer, + **trainer_config, + strategy=strategy, + plugins=run.Config(nl.MegatronMixedPrecision, precision="bf16-mixed"), + callbacks=[run.Config(TimingCallback)], + ) + +def get_optim() -> run.Config[OptimizerConfig]: + optim_params = { + "optimizer": "adam", + "lr": 1e-4, + "min_lr": 1e-5, + "use_distributed_optimizer": True, + "bf16": True, + "adam_beta1": 0.9, + "adam_beta2": 0.95, + "overlap_grad_reduce": True, + "overlap_param_gather": True, + "clip_grad": 1.0, + "adam_eps": 1e-5, + } + + optim_config = run.Config( + OptimizerConfig, + **optim_params, + ) + + sched = run.Config( + CosineAnnealingScheduler, + warmup_steps=10, + constant_steps=0, + min_lr=optim_config.min_lr, + ) + + return run.Config( + MegatronOptimizerModule, + config=optim_config, + lr_scheduler=sched, + ) + +def get_logger() -> run.Config[nl.NeMoLogger]: + tb_logger = run.Config(TensorBoardLogger, save_dir="tb_logs") + + ckpt = run.Config( + nl.ModelCheckpoint, + monitor="reduced_train_loss", + save_best_model=False, + save_last=False, + save_top_k=0, + ) + + return run.Config( + nl.NeMoLogger, + ckpt=ckpt, + tensorboard=tb_logger, + wandb=None, + dir="/", + ) class TestBaseConfigs: def test_gpt3_base_config(self): - model_cls = getattr(base_configs, "GPT") + # GPT3 7B + model_config = run.Config(GPTConfig126M) + runner = AutoConfigurator(model=model_config, num_nodes=8, path_to_logs="/", data_paths="/") + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(2048, 'auto') + trainer_config = get_trainer(8) + optim_config = get_optim() + logger_config = get_logger() - # GPT3 126M - model_126m = model_cls(size=126, measure="M", cfg={"nemo_sdk": True}) - config_cls = model_126m.get_model_config() - config_cls_name = get_class_name(config_cls) assert ( - config_cls_name == "GPTConfig126M" - ), "the name of the config class for the GPT3 126M model should be 'GPTConfig126M'." - - # GPT3 5B - model_5b = model_cls(size=5) - config_cls = model_5b.get_model_config() + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" assert ( - config_cls.__class__.__name__ == "GPTConfig5B" - ), "the name of the config class for the GPT3 5B model should be 'GPTConfig5B'." - - # GPT3 7B - model_7b = model_cls(size=7, cfg={"nemo_sdk": True}) - config_cls = model_7b.get_model_config() - config_cls_name = get_class_name(config_cls) + model_size == 0.126 + ), f"0.126 is expected size for {model_config} but got {model_size}" assert ( - config_cls_name == "GPTConfig7B" - ), "the name of the config class for the GPT3 7B model should be 'GPTConfig7B'." - - # GPT3 20B - model_20b = model_cls(size=20) - config_cls = model_20b.get_model_config() + model_type == "gpt3" + ), f"gpt3 is expected model type for {model_config} but got {model_type}" assert ( - config_cls.__class__.__name__ == "GPTConfig20B" - ), "the name of the config class for the GPT3 20B model should be 'GPTConfig20B'." - - # GPT3 40B - model_40b = model_cls(size=40) - config_cls = model_40b.get_model_config() + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" assert ( - config_cls.__class__.__name__ == "GPTConfig40B" - ), "the name of the config class for the GPT3 40B model should be 'GPTConfig40B'." 
- - # GPT3 175B - model_175b = model_cls(size=175, cfg={"nemo_sdk": True}) - config_cls = model_175b.get_model_config() - config_cls_name = get_class_name(config_cls) + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" assert ( - config_cls_name == "GPTConfig175B" - ), "the name of the config class for the GPT3 175B model should be 'GPTConfig175B'." - - try: - model_111b = model_cls(size=111) - config_cls = model_111b.get_model_config() - config_cls_name = get_class_name(config_cls) - assert ( - config_cls_name == "GPTConfig111B" - ), "the name of the config class for the GPT3 111B model should be 'GPTConfig111B'." - except AttributeError: - None + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" def test_llama_base_config(self): - model_cls = getattr(base_configs, "Llama") + # Llama3 8B + model_config = run.Config(Llama3Config8B) + runner = AutoConfigurator( + model=model_config, + num_nodes=16, + path_to_logs="/", + data_paths="/", + seq_length=8192, + global_batch_size=2048, + ) + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(8192, 2048) + trainer_config = get_trainer(16) + optim_config = get_optim() + logger_config = get_logger() - # Llama2_7B - model_7b = model_cls(size=7, cfg={"nemo_sdk": True}) - config_cls = model_7b.get_model_config() - config_cls_name = get_class_name(config_cls) assert ( - config_cls_name == "Llama2Config7B" - ), "the name of the config class for the Llama2 7B model should be 'Llama2Config7B'." - - # Llama2_13B - model_13b = model_cls(size=13) - config_cls = model_13b.get_model_config() + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" assert ( - config_cls.__class__.__name__ == "Llama2Config13B" - ), "the name of the config class for the Llama2 13B model should be 'Llama2Config13B'." - - # Llama2_70B - model_70b = model_cls(size=70) - config_cls = model_70b.get_model_config() + model_size == 8 + ), f"8 is expected size for {model_config} but got {model_size}" assert ( - config_cls.__class__.__name__ == "Llama2Config70B" - ), "the name of the config class for the Llama2 70B model should be 'Llama2Config70B'." - - # Llama3_70B - model_70b = model_cls(size=70, version=3) - config_cls = model_70b.get_model_config() + model_type == "llama" + ), f"llama is expected model type for {model_config} but got {model_type}" assert ( - config_cls.__class__.__name__ == "Llama3Config70B" - ), "the name of the config class for the Llama3 70B model should be 'Llama3Config70B'." - - # Llama3_8B - model_8b = model_cls(size=8, version=3, cfg={"nemo_sdk": True}) - config_cls = model_8b.get_model_config() - config_cls_name = get_class_name(config_cls) + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" assert ( - config_cls_name == "Llama3Config8B" - ), "the name of the config class for the Llama3 8B model should be 'Llama3Config8B'." 
- - def test_mixtral_base_config(self): - model_cls = getattr(base_configs, "Mixtral") - - # Mixtral 8x7B - model_7b = model_cls(size=7) - config_cls = model_7b.get_model_config() + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" assert ( - config_cls.__class__.__name__ == "MixtralConfig8x7B" - ), "the name of the config class for the Mixtral 8x7B model should be 'MixtralConfig8x7B'." + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" def test_mistral_base_config(self): - model_cls = getattr(base_configs, "Mistral") - # Mistral 7B - model_7b = model_cls(size=7, cfg={"nemo_sdk": True}) - config_cls = model_7b.get_model_config() - config_cls_name = get_class_name(config_cls) - assert ( - config_cls_name == "MistralConfig7B" - ), "the name of the config class for the Mistral 7B model should be 'MistralConfig7B'." - - def test_basic_base_config(self): - model_cls = getattr(base_configs.basic, "Basic") - - # Basic model class - model = model_cls(measure="M") - - assert model.name == None - assert model.version == None - assert model.size == None - assert model.measure == "M" - assert model.cfg == {} - - def test_custom_base_config(self): - model = base_configs.custom(name="Llama", cfg={}) - - assert model.name == "Llama" - assert model.version == 2 - assert model.size == 7 - assert model.measure == "B" - assert model.cfg == {} - - def test_trainer_config(self): - model_cls = getattr(base_configs, "GPT") - - model_126m = model_cls(size=126, measure="M") - trainer_config_source = model_126m.get_trainer_config() - - trainer_config_target = { - "accelerator": "gpu", - "logger": False, - "enable_checkpointing": False, - "use_distributed_sampler": False, - "max_epochs": None, - "log_every_n_steps": 1, - "limit_val_batches": 1, - "limit_test_batches": 1, - "accumulate_grad_batches": 1, - "gradient_clip_val": 1.0, - "num_nodes": None, - "devices": None, - "max_steps": None, - "val_check_interval": None, - } + model_config = run.Config(MistralConfig7B) + runner = AutoConfigurator( + model=model_config, + num_nodes=16, + path_to_logs="/", + data_paths="/", + seq_length=32768, + global_batch_size=2048, + ) + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(32768, 2048) + trainer_config = get_trainer(16) + optim_config = get_optim() + logger_config = get_logger() assert ( - trainer_config_target == trainer_config_source - ), f"{trainer_config_target} is expected trainer config but got {trainer_config_source}" - - def test_data_config(self): - model_cls = getattr(base_configs, "Llama") - - model_70b = model_cls(size=70) - data_config_source = model_70b.get_data_config() - - data_config_target = { - "paths": None, - "seq_length": None, - "global_batch_size": None, - "num_workers": 2, - "split": "99990,8,2", - "index_mapping_dir": None, - } - + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" assert ( - data_config_target == data_config_source - ), f"{data_config_target} is expected data config but got {data_config_source}" - - def test_optim_config(self): - model_cls = getattr(base_configs, "Mixtral") - - model_7b = 
model_cls(size=7) - optim_config_source = model_7b.get_optim_config() - - optim_config_target = OptimizerConfig( - optimizer='adam', - lr=1e-4, - min_lr=1e-5, - use_distributed_optimizer=True, - bf16=True, - adam_beta1=0.9, - adam_beta2=0.95, - overlap_grad_reduce=False, - overlap_param_gather=True, + model_size == 7 + ), f"7 is expected size for {model_config} but got {model_size}" + assert ( + model_type == "mistral" + ), f"mistral is expected model type for {model_config} but got {model_type}" + assert ( + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" + assert ( + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" + + def test_mixtral_base_config(self): + # Mixtral 8x3B + model_config = run.Config(MixtralConfig8x3B) + runner = AutoConfigurator( + model=model_config, + num_nodes=16, + path_to_logs="/", + data_paths="/", + seq_length=4096, + global_batch_size=2048, ) + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(4096, 2048) + trainer_config = get_trainer(16) + optim_config = get_optim() + logger_config = get_logger() assert ( - optim_config_target == optim_config_source - ), f"{optim_config_target} is expected optim config but got {optim_config_source}" - - def test_optim_config_nemo_sdk(self): - model_cls = getattr(base_configs, "Mixtral") - - model_7b = model_cls(size=7, cfg={"nemo_sdk": True}) - optim_config_source = model_7b.get_optim_config() - - optim_config_target = Config( - OptimizerConfig, - optimizer='adam', - lr=1e-4, - min_lr=1e-5, - use_distributed_optimizer=True, - bf16=True, - adam_beta1=0.9, - adam_beta2=0.95, - overlap_grad_reduce=False, - overlap_param_gather=True, + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" + assert ( + model_size == 3 + ), f"3 is expected size for {model_config} but got {model_size}" + assert ( + model_type == "mixtral" + ), f"mixtral is expected model type for {model_config} but got {model_type}" + assert ( + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" + assert ( + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" + + def test_gemma_base_config(self): + # Gemma 2B + model_config = run.Config(GemmaConfig2B) + runner = AutoConfigurator( + model=model_config, + num_nodes=8, + path_to_logs="/", + data_paths="/", + seq_length=4096, + global_batch_size=1024, ) + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(4096, 1024) + trainer_config = get_trainer(8) + optim_config 
= get_optim() + logger_config = get_logger() assert ( - optim_config_target == optim_config_source - ), f"{optim_config_target} is expected optim config but got {optim_config_source}" - - def test_run_config(self): - model_cls = getattr(base_configs, "Mistral") - - model_7b = model_cls(size=7) - run_config_source = model_7b.get_run_config() - - run_config_target = { - "name": f"Mistral_7B", - "results_dir": None, - "time_limit": "0-00:30:00", - } + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" + assert ( + model_size == 2 + ), f"2 is expected size for {model_config} but got {model_size}" + assert ( + model_type == "gemma" + ), f"gemma is expected model type for {model_config} but got {model_type}" + assert ( + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" + assert ( + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" + + def test_nemotron_base_config(self): + # Nemotron 22B + model_config = run.Config(Nemotron4Config22B) + runner = AutoConfigurator( + model=model_config, + num_nodes=64, + path_to_logs="/", + data_paths="/", + seq_length=4096, + global_batch_size=2048, + ) + base_config = BaseConfig(runner) + model_size = runner._get_model_size(model_config) + model_type = runner._get_model_type(model_config) + data_config = get_data(4096, 2048) + trainer_config = get_trainer(64) + optim_config = get_optim() + logger_config = get_logger() assert ( - run_config_target == run_config_source - ), f"{run_config_target} is expected run config but got {run_config_source}" + base_config.model == model_config + ), f"{model_config} is expected class object but got {base_config.model}" + assert ( + model_size == 22 + ), f"22 is expected size for {model_config} but got {model_size}" + assert ( + model_type == "nemotron" + ), f"nemotron is expected model type for {model_config} but got {model_type}" + assert ( + base_config.data == data_config + ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" + assert ( + base_config.trainer == trainer_config + ), f"f{trainer_config} is expected trainer config for {model_config} but got {base_config.trainer}" + assert ( + base_config.optim == optim_config + ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" + assert ( + base_config.log == logger_config + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" \ No newline at end of file diff --git a/tests/collections/llm/auto_conf/test_generate_configs.py b/tests/collections/llm/auto_conf/test_generate_configs.py index 5910e3dca39c..6386a2f09c7a 100644 --- a/tests/collections/llm/auto_conf/test_generate_configs.py +++ b/tests/collections/llm/auto_conf/test_generate_configs.py @@ -1,28 +1,40 @@ -from nemo.collections.llm.tools.auto_configurator import AutoConfigurator +import nemo_run as run +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs +from nemo.collections.llm import ( + GPTConfig5B, + Llama3Config70B, + MistralConfig7B, + 
MixtralConfig8x22B, + GemmaConfig7B, + Nemotron3Config8B, +) -def get_auto_config(configs): - auto_configs = [] - for config in configs.values(): - auto_conf_values = config['auto_config'].values() - auto_configs.append(list(auto_conf_values)) - global_batch_size = config['model'].global_batch_size - seq_length = config['model'].seq_length +def get_auto_configs(configs): + auto_configs = [] + for run_name, config in configs.items(): + auto_configs.append( + [ + config.trainer.strategy.tensor_model_parallel_size, + config.trainer.strategy.pipeline_model_parallel_size, + config.trainer.strategy.context_parallel_size, + config.trainer.strategy.expert_model_parallel_size, + config.data.micro_batch_size, + ] + ) - return auto_configs, global_batch_size, seq_length + return auto_configs class TestGenerateConfgis: def test_gpt_model(self): # GPT3 126M runner = AutoConfigurator( - model_type="gpt3", - model_size=126, - model_measure="M", - num_nodes=8, - seq_length=512, - global_batch_size=256, + model=run.Config(GPTConfig5B), + num_nodes=16, + seq_length=2048, + global_batch_size=2048, tensor_parallel_sizes=[4], pipeline_parallel_sizes=[2], micro_batch_sizes=[1, 2], @@ -30,19 +42,21 @@ def test_gpt_model(self): expert_parallel_sizes=[1], min_model_parallel_size=8, max_model_parallel_size=8, - data_paths=[""], + data_paths="/", + path_to_logs="/", ) - configs = runner.generate_configs() - auto_configs, global_batch_size, seq_length = get_auto_config(configs) + _, configs = generate_configs(runner) - for run_name, config in configs.items(): - assert config['data']['micro_batch_size'] == config['auto_config']['micro_batch_size'] - assert config['data']['seq_length'] == 512 - assert config['data']['global_batch_size'] == 256 + mbs = [1, 2] + for run_name, config, mb in zip(configs.keys(), configs.values(), mbs): + assert config.data.micro_batch_size == mb + assert config.data.seq_length == 2048 + assert config.data.global_batch_size == 2048 - assert len(auto_configs) == 2, f"{len(auto_configs)} configurations were generated but 2 were expected." + assert len(configs) == 2, f"{len(configs)} configurations were generated but 2 were expected." + auto_configs = get_auto_configs(configs) assert auto_configs[0] == [ 4, 2, @@ -54,393 +68,240 @@ def test_gpt_model(self): assert auto_configs[1] == [ 4, 2, - 2, 1, 1, - ], f"[4, 2, 2, 1, 1] is expected configuration output but got {auto_configs[1]}." - - assert global_batch_size == 256, f"expected global_batch_size is 256 but got {global_batch_size}." - - assert seq_length == 512, f"expected seq_length is 512 but got {seq_length}." - - # GPT3 20B + 2, + ], f"[4, 2, 1, 1, 2] is expected configuration output but got {auto_configs[1]}." 
+ + def test_llama_model(self): + # Llama3 70B runner = AutoConfigurator( - model_type="gpt3", - model_size=20, - num_nodes=64, - seq_length=2048, + model=run.Config(Llama3Config70B), + num_nodes=128, + seq_length=8192, global_batch_size=2048, + tensor_parallel_sizes="auto", + pipeline_parallel_sizes="auto", micro_batch_sizes=[1], - context_parallel_sizes=[1], + context_parallel_sizes=[1, 2, 4], expert_parallel_sizes=[1], min_model_parallel_size=16, - max_model_parallel_size=32, - max_training_days=8, - data_paths=[""], - ) - - configs = runner.generate_configs() - auto_configs, _, _ = get_auto_config(configs) - - for run_name, config in configs.items(): - assert config['data']['micro_batch_size'] == config['auto_config']['micro_batch_size'] - assert config['data']['seq_length'] == 2048 - assert config['data']['global_batch_size'] == 2048 - - assert len(auto_configs) == 1, f"{len(auto_configs)} configurations were generated but 1 were expected." - - assert auto_configs[0] == [ - 11, - 4, - 4, - 1, - 1, - 1, - ], f"[11, 4, 4, 1, 1, 1] is expected configuration output but got {auto_configs[0]}." - - # GPT3 175B - runner = AutoConfigurator( - model_type="gpt3", - model_size=175, - num_nodes=128, - seq_length=2048, - global_batch_size=2048, - context_parallel_sizes=[1], - expert_parallel_sizes=[1], - min_model_parallel_size=64, max_model_parallel_size=64, - max_training_days=16, - data_paths=[""], + data_paths="/", + path_to_logs="/", ) - configs = runner.generate_configs() - auto_configs, _, _ = get_auto_config(configs) + _, configs = generate_configs(runner) - for run_name, config in configs.items(): - assert config['data']['micro_batch_size'] == config['auto_config']['micro_batch_size'] - assert config['data']['seq_length'] == 2048 - assert config['data']['global_batch_size'] == 2048 + mbs = [1, 1, 1] + for run_name, config, mb in zip(configs.keys(), configs.values(), mbs): + assert config.data.micro_batch_size == mb + assert config.data.seq_length == 8192 + assert config.data.global_batch_size == 2048 - assert len(auto_configs) == 3, f"{len(auto_configs)} configurations were generated but 3 were expected." + assert len(configs) == 3, f"{len(configs)} configurations were generated but 3 were expected." + auto_configs = get_auto_configs(configs) assert auto_configs[0] == [ - 12, - 8, - 8, + 4, 1, + 4, 1, 1, - ], f"[12, 8, 8, 1, 1, 1] is expected configuration output but got {auto_configs[0]}." + ], f"[4, 1, 4, 1, 1] is expected configuration output but got {auto_configs[0]}." assert auto_configs[1] == [ - 12, - 8, 8, + 1, 2, 1, 1, - ], f"[12, 8, 8, 2, 1, 1] is expected configuration output but got {auto_configs[1]}." + ], f"[8, 1, 2, 1, 1] is expected configuration output but got {auto_configs[1]}." assert auto_configs[2] == [ - 12, - 8, 8, + 1, 4, 1, 1, - ], f"[12, 8, 8, 4, 1, 1] is expected configuration output but got {auto_configs[2]}." - - def test_llama_model(self): - # Llama2 7B + ], f"[8, 1, 4, 1, 1] is expected configuration output but got {auto_configs[2]}." 
+ + def test_mistral_model(self): + # Mistral 7B runner = AutoConfigurator( - model_type="llama", - model_size=7, - model_version=2, + model=run.Config(MistralConfig7B), num_nodes=16, seq_length=4096, global_batch_size=2048, - tensor_parallel_sizes=[1], - pipeline_parallel_sizes=[1], + tensor_parallel_sizes=[4], + pipeline_parallel_sizes=[1, 2], micro_batch_sizes=[1], - context_parallel_sizes=[1, 2], + context_parallel_sizes=[1], expert_parallel_sizes=[1], - min_model_parallel_size=1, - max_model_parallel_size=16, - max_training_days=8, - data_paths=[""], + min_model_parallel_size=4, + max_model_parallel_size=8, + data_paths="/", + path_to_logs="/", ) - configs = runner.generate_configs() - auto_configs, _, _ = get_auto_config(configs) + _, configs = generate_configs(runner) - for run_name, config in configs.items(): - assert config['data']['micro_batch_size'] == config['auto_config']['micro_batch_size'] - assert config['data']['seq_length'] == 4096 - assert config['data']['global_batch_size'] == 2048 + mbs = [1, 1] + for run_name, config, mb in zip(configs.keys(), configs.values(), mbs): + assert config.data.micro_batch_size == mb + assert config.data.seq_length == 4096 + assert config.data.global_batch_size == 2048 - assert len(auto_configs) == 2, f"{len(auto_configs)} configurations were generated but 2 were expected." + assert len(configs) == 2, f"{len(configs)} configurations were generated but 3 were expected." + auto_configs = get_auto_configs(configs) assert auto_configs[0] == [ + 4, 1, 1, 1, 1, - 1, - ], f"[1, 1, 1, 1, 1] is expected configuration output but got {auto_configs[0]}." + ], f"[4, 1, 1, 1, 1] is expected configuration output but got {auto_configs[0]}." assert auto_configs[1] == [ + 4, + 2, 1, 1, 1, - 2, - 1, - ], f"[1, 1, 1, 2, 1] is expected configuration output but got {auto_configs[1]}." - - # Llama3 8B + ], f"[4, 2, 1, 1, 1] is expected configuration output but got {auto_configs[1]}." + + def test_mixtral_model(self): + # Mixtral 8x22B runner = AutoConfigurator( - model_type="llama", - model_size=8, - model_version=3, + model=run.Config(MixtralConfig8x22B), num_nodes=16, - seq_length=8192, - global_batch_size=2048, - tensor_parallel_sizes=[2], - pipeline_parallel_sizes=[2], - micro_batch_sizes=[2], - context_parallel_sizes=[2], - expert_parallel_sizes=[1, 2, 4], - min_model_parallel_size=1, - max_model_parallel_size=16, - max_training_days=8, - data_paths=[""], - ) - - configs = runner.generate_configs() - auto_configs, _, _ = get_auto_config(configs) - - for run_name, config in configs.items(): - assert config['data']['micro_batch_size'] == config['auto_config']['micro_batch_size'] - assert config['data']['seq_length'] == 8192 - assert config['data']['global_batch_size'] == 2048 - - assert len(auto_configs) == 1, f"{len(auto_configs)} configurations were generated but 1 were expected." - - assert auto_configs[0] == [ - 2, - 2, - 2, - 2, - 1, - ], f"[2, 2, 2, 2, 1] is expected configuration output but got {auto_configs[0]}." 
- - # Llama3 70B - runner = AutoConfigurator( - model_type="llama", - model_size=70, - model_version=3, - num_nodes=64, - seq_length=8192, + seq_length=4096, global_batch_size=2048, - tensor_parallel_sizes=[1, 2], - pipeline_parallel_sizes=[1, 2], + tensor_parallel_sizes=[4], + pipeline_parallel_sizes=[1], micro_batch_sizes=[1], - context_parallel_sizes=[2], - expert_parallel_sizes=[1, 2, 4], - min_model_parallel_size=1, - max_model_parallel_size=4, - max_training_days=30, - data_paths=[""], + context_parallel_sizes=[1], + expert_parallel_sizes=[1, 2], + min_model_parallel_size=4, + max_model_parallel_size=8, + data_paths="/", + path_to_logs="/", ) - configs = runner.generate_configs() - auto_configs, global_batch_size, seq_length = get_auto_config(configs) + _, configs = generate_configs(runner) - for run_name, config in configs.items(): - assert config['data']['micro_batch_size'] == config['auto_config']['micro_batch_size'] - assert config['data']['seq_length'] == 8192 - assert config['data']['global_batch_size'] == 2048 + mbs = [1, 1] + for run_name, config, mb in zip(configs.keys(), configs.values(), mbs): + assert config.data.micro_batch_size == mb + assert config.data.seq_length == 4096 + assert config.data.global_batch_size == 2048 - assert len(auto_configs) == 3, f"{len(auto_configs)} configurations were generated but 3 were expected." + assert len(configs) == 2, f"{len(configs)} configurations were generated but 3 were expected." + auto_configs = get_auto_configs(configs) assert auto_configs[0] == [ + 4, 1, 1, 1, - 2, 1, - ], f"[1, 1, 1, 2, 1] is expected configuration output but got {auto_configs[0]}." + ], f"[4, 1, 1, 1, 1] is expected configuration output but got {auto_configs[0]}." assert auto_configs[1] == [ - 1, - 2, - 1, - 2, - 1, - ], f"[1, 2, 1, 2, 1] is expected configuration output but got {auto_configs[1]}." - - assert auto_configs[2] == [ - 2, + 4, 1, 1, 2, 1, - ], f"[2, 1, 1, 2, 1] is expected configuration output but got {auto_configs[2]}." - - assert global_batch_size == 2048, f"expected global_batch_size is 2048 but got {global_batch_size}." - - assert seq_length == 8192, f"expected seq_length is 8192 but got {seq_length}." + ], f"[4, 1, 1, 2, 1] is expected configuration output but got {auto_configs[1]}." 
- def test_mixtral_model(self): - # Mixtral 8x7B + def test_gemma_model(self): + # Gemma 7B runner = AutoConfigurator( - model_type="mixtral", - model_size=7, - model_version=8, + model=run.Config(GemmaConfig7B), num_nodes=16, - seq_length=4096, + seq_length=8192, global_batch_size=2048, - tensor_parallel_sizes=[2, 3, 4], - micro_batch_sizes=[2], - expert_parallel_sizes=[2, 4], - data_paths=[""], + tensor_parallel_sizes=[2], + pipeline_parallel_sizes=[2], + micro_batch_sizes=[1, 2], + context_parallel_sizes=[1], + expert_parallel_sizes=[1], + min_model_parallel_size=4, + max_model_parallel_size=8, + data_paths="/", + path_to_logs="/", ) - configs = runner.generate_configs() - auto_configs, global_batch_size, seq_length = get_auto_config(configs) + _, configs = generate_configs(runner) - for run_name, config in configs.items(): - assert config['data']['micro_batch_size'] == config['auto_config']['micro_batch_size'] - assert config['data']['seq_length'] == 4096 - assert config['data']['global_batch_size'] == 2048 + mbs = [1, 2] + for run_name, config, mb in zip(configs.keys(), configs.values(), mbs): + assert config.data.micro_batch_size == mb + assert config.data.seq_length == 8192 + assert config.data.global_batch_size == 2048 - assert len(auto_configs) == 4, f"{len(auto_configs)} configurations were generated but 4 were expected." + assert len(configs) == 2, f"{len(configs)} configurations were generated but 3 were expected." + auto_configs = get_auto_configs(configs) assert auto_configs[0] == [ 2, - 1, 2, 1, - 2, - ], f"[2, 1, 2, 1, 2] is expected configuration output but got {auto_configs[0]}." - - assert auto_configs[1] == [ - 2, 1, - 2, 1, - 4, - ], f"[2, 1, 2, 1, 4] is expected configuration output but got {auto_configs[1]}." + ], f"[2, 2, 1, 1, 1] is expected configuration output but got {auto_configs[0]}." - assert auto_configs[2] == [ - 2, - 2, + assert auto_configs[1] == [ 2, - 1, 2, - ], f"[2, 2, 2, 1, 2] is expected configuration output but got {auto_configs[2]}." - - assert auto_configs[3] == [ - 4, 1, - 2, 1, 2, - ], f"[4, 1, 2, 1, 2] is expected configuration output but got {auto_configs[3]}." - - assert global_batch_size == 2048, f"expected global_batch_size is 2048 but got {global_batch_size}." - - assert seq_length == 4096, f"expected seq_length is 4096 but got {seq_length}." + ], f"[2, 2, 1, 1, 2] is expected configuration output but got {auto_configs[1]}." 
- def test_mistral_model(self): - # Mistral 7B + def test_nemotron_model(self): + # Nemotron3 8B runner = AutoConfigurator( - model_type="mistral", - model_size=7, + model=run.Config(Nemotron3Config8B), num_nodes=16, - seq_length=16384, + seq_length=4096, global_batch_size=2048, - tensor_parallel_sizes=[1, 2, 3], - pipeline_parallel_sizes=[2, 11, 17], - micro_batch_sizes=[1, 256], - expert_parallel_sizes=[2, 13], - data_paths=[""], + tensor_parallel_sizes=[1], + pipeline_parallel_sizes=[4], + micro_batch_sizes=[1, 2], + context_parallel_sizes=[1], + expert_parallel_sizes=[1], + min_model_parallel_size=4, + max_model_parallel_size=8, + data_paths="/", + path_to_logs="/", ) - configs = runner.generate_configs() - auto_configs, global_batch_size, seq_length = get_auto_config(configs) + _, configs = generate_configs(runner) - for run_name, config in configs.items(): - assert config['data']['micro_batch_size'] == config['auto_config']['micro_batch_size'] - assert config['data']['seq_length'] == 16384 - assert config['data']['global_batch_size'] == 2048 + mbs = [1, 2] + for run_name, config, mb in zip(configs.keys(), configs.values(), mbs): + assert config.data.micro_batch_size == mb + assert config.data.seq_length == 4096 + assert config.data.global_batch_size == 2048 - assert len(auto_configs) == 2, f"{len(auto_configs)} configurations were generated but 2 were expected." + assert len(configs) == 2, f"{len(configs)} configurations were generated but 3 were expected." + auto_configs = get_auto_configs(configs) assert auto_configs[0] == [ 1, - 2, - 1, + 4, 1, - 2, - ], f"[1, 2, 1, 1, 2] is expected configuration output but got {auto_configs[0]}." - - assert auto_configs[1] == [ - 2, - 2, 1, 1, - 2, - ], f"[2, 2, 1, 1, 2] is expected configuration output but got {auto_configs[1]}." - - assert global_batch_size == 2048, f"expected global_batch_size is 2048 but got {global_batch_size}." + ], f"[2, 2, 1, 1, 1] is expected configuration output but got {auto_configs[0]}." - assert seq_length == 16384, f"expected seq_length is 16384 but got {seq_length}." - - def test_custom_model(self): - # Custom 1B - runner = AutoConfigurator( - model_type="llama", - num_nodes=4, - seq_length=512, - tensor_parallel_sizes=[1, 2], - pipeline_parallel_sizes=[2, 4], - micro_batch_sizes=[1, 256], - context_parallel_sizes=[2, 22], - expert_parallel_sizes=[1, 13], - min_model_parallel_size=2, - max_model_parallel_size=8, - vocab_size=32000, - max_training_days=7, - custom_model=True, - data_paths=[""], - ) - - configs = runner.generate_configs() - auto_configs, global_batch_size, seq_length = get_auto_config(configs) - - for run_name, config in configs.items(): - assert config['data']['micro_batch_size'] == config['auto_config']['micro_batch_size'] - assert config['data']['seq_length'] == 512 - assert config['data']['global_batch_size'] == 1024 - - assert len(auto_configs) == 2, f"{len(auto_configs)} configurations were generated but 2 were expected." - print(auto_configs) - assert auto_configs[0] == [ - 1, - 2, + assert auto_configs[1] == [ 1, - 2, + 4, 1, - ], f"[1, 2, 1, 2, 1] is expected configuration output but got {auto_configs[0]}." - - assert auto_configs[1] == [ - 2, - 2, 1, 2, - 1, - ], f"[2, 2, 1, 2, 1] is expected configuration output but got {auto_configs[1]}." - - assert global_batch_size == 1024, f"expected global_batch_size is 1024 but got {global_batch_size}." - - assert seq_length == 512, f"expected seq_length is 512 but got {seq_length}." 
+ ], f"[2, 2, 1, 1, 2] is expected configuration output but got {auto_configs[1]}." \ No newline at end of file diff --git a/tests/collections/llm/auto_conf/test_utils.py b/tests/collections/llm/auto_conf/test_utils.py index 6809b2a84cd6..339c6aea0ee5 100644 --- a/tests/collections/llm/auto_conf/test_utils.py +++ b/tests/collections/llm/auto_conf/test_utils.py @@ -1,5 +1,4 @@ from nemo.collections.llm.tools.auto_configurator.core.base_config import _estimate_training_time, calculate_model_size -from nemo.collections.llm.tools.auto_configurator.core.utils import calculate_model_size_params class TestUtils: @@ -48,6 +47,28 @@ def test_calculate_model_size(self): ) assert model_size == 799.37, f"expected model_size is 799.37 but got {model_size}." + # Gemma + model_size = calculate_model_size( + 512, + 30, + None, + 240, + 100, + "gemma", + ) + assert model_size == 398.13, f"expected model_size is 398.13 but got {model_size}." + + # Nemotron + model_size = calculate_model_size( + 256, + 15, + None, + 240, + 120, + "gemma", + ) + assert model_size == 82.94, f"expected model_size is 82.94 but got {model_size}." + def test_calculate_train_time(self): # GPT train_time = _estimate_training_time( @@ -89,67 +110,22 @@ def test_calculate_train_time(self): ) assert train_time == 176.83, f"expected train_time is 176.83 but got {train_time}." - def test_calculate_model_params(self): - # GPT - params = calculate_model_size_params( - 40, - 51200, - 2048, - "gpt3", - ) - assert params == ( - 48, - 8192, - 64, - None, - None, - 8e-05, - ), f"expected model_params set is (48, 8192, 64, None, None, 8e-05) but got {params}." - - # Llama - params = calculate_model_size_params( - 70, - 32000, - 8192, - "llama", - ) - assert params == ( - 56, - 10240, - 80, - None, - None, - 7e-05, - ), f"expected model_params set is (56, 10240, 80, None, None, 7e-05) but got {params}." - - # Mixtral - params = calculate_model_size_params( - 30, - 32000, - 4096, - "mixtral", + # Gemma + train_time = _estimate_training_time( + 7, + 8, + 55, + 100, + "gemma", ) - assert params == ( - 36, - 8192, - 64, - None, - None, - 8e-05, - ), f"expected model_params set is (36, 8192, 64, None, None, 8e-05) but got {params}." + assert train_time == 147.31, f"expected train_time is 147.31 but got {train_time}." - # Mistral - params = calculate_model_size_params( - 0.5, - 32000, - 4096, - "mistral", + # Nemotron + train_time = _estimate_training_time( + 14, + 12, + 11, + 55, + "nemotron", ) - assert params == ( - 16, - 1536, - 16, - None, - None, - 0.00025, - ), f"expected model_params set is (16, 1536, 16, None, None, 0.00025) but got {params}." + assert train_time == 540.12, f"expected train_time is 540.12 but got {train_time}." 
\ No newline at end of file From 7fd82cfa9f54ee902956f28ed5c8090dbb5b2cf7 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 12:28:59 +0000 Subject: [PATCH 53/63] Apply isort and black reformatting Signed-off-by: dimapihtar --- .../llm/auto_conf/test_base_configs.py | 75 +++++++------------ .../llm/auto_conf/test_generate_configs.py | 12 +-- tests/collections/llm/auto_conf/test_utils.py | 2 +- 3 files changed, 34 insertions(+), 55 deletions(-) diff --git a/tests/collections/llm/auto_conf/test_base_configs.py b/tests/collections/llm/auto_conf/test_base_configs.py index 2a93fdd89496..2e1a1aa5264e 100644 --- a/tests/collections/llm/auto_conf/test_base_configs.py +++ b/tests/collections/llm/auto_conf/test_base_configs.py @@ -1,31 +1,31 @@ import re + import nemo_run as run import torch - from megatron.core.optimizer import OptimizerConfig - from pytorch_lightning.loggers import TensorBoardLogger -from nemo.collections.llm.tools.auto_configurator.core.base_config import BaseConfig -from nemo.collections.llm.tools.auto_configurator import AutoConfigurator +from nemo import lightning as nl +from nemo.collections.common.tokenizers import AutoTokenizer from nemo.collections.llm import ( + GemmaConfig2B, GPTConfig126M, Llama3Config8B, MistralConfig7B, MixtralConfig8x3B, - GemmaConfig2B, Nemotron4Config22B, - PreTrainingDataModule + PreTrainingDataModule, ) -from nemo.collections.common.tokenizers import AutoTokenizer +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator +from nemo.collections.llm.tools.auto_configurator.core.base_config import BaseConfig from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule from nemo.utils.exp_manager import TimingCallback -from nemo import lightning as nl def get_tokenizer() -> run.Config: return run.Config(AutoTokenizer, pretrained_model_name="GPT2BPETokenizer") + def get_data(seq_length, global_batch_size) -> run.Config[PreTrainingDataModule]: config = { "paths": "/", @@ -41,6 +41,7 @@ def get_data(seq_length, global_batch_size) -> run.Config[PreTrainingDataModule] tokenizer=get_tokenizer(), ) + def get_trainer(num_nodes) -> run.Config[nl.Trainer]: trainer_config = { "accelerator": "gpu", @@ -70,6 +71,7 @@ def get_trainer(num_nodes) -> run.Config[nl.Trainer]: callbacks=[run.Config(TimingCallback)], ) + def get_optim() -> run.Config[OptimizerConfig]: optim_params = { "optimizer": "adam", @@ -103,6 +105,7 @@ def get_optim() -> run.Config[OptimizerConfig]: lr_scheduler=sched, ) + def get_logger() -> run.Config[nl.NeMoLogger]: tb_logger = run.Config(TensorBoardLogger, save_dir="tb_logs") @@ -139,12 +142,8 @@ def test_gpt3_base_config(self): assert ( base_config.model == model_config ), f"{model_config} is expected class object but got {base_config.model}" - assert ( - model_size == 0.126 - ), f"0.126 is expected size for {model_config} but got {model_size}" - assert ( - model_type == "gpt3" - ), f"gpt3 is expected model type for {model_config} but got {model_type}" + assert model_size == 0.126, f"0.126 is expected size for {model_config} but got {model_size}" + assert model_type == "gpt3", f"gpt3 is expected model type for {model_config} but got {model_type}" assert ( base_config.data == data_config ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" @@ -180,12 +179,8 @@ def test_llama_base_config(self): assert ( base_config.model == model_config ), f"{model_config} is expected class object but got {base_config.model}" - assert ( - model_size == 8 - ), f"8 is 
expected size for {model_config} but got {model_size}" - assert ( - model_type == "llama" - ), f"llama is expected model type for {model_config} but got {model_type}" + assert model_size == 8, f"8 is expected size for {model_config} but got {model_size}" + assert model_type == "llama", f"llama is expected model type for {model_config} but got {model_type}" assert ( base_config.data == data_config ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" @@ -221,12 +216,8 @@ def test_mistral_base_config(self): assert ( base_config.model == model_config ), f"{model_config} is expected class object but got {base_config.model}" - assert ( - model_size == 7 - ), f"7 is expected size for {model_config} but got {model_size}" - assert ( - model_type == "mistral" - ), f"mistral is expected model type for {model_config} but got {model_type}" + assert model_size == 7, f"7 is expected size for {model_config} but got {model_size}" + assert model_type == "mistral", f"mistral is expected model type for {model_config} but got {model_type}" assert ( base_config.data == data_config ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" @@ -239,7 +230,7 @@ def test_mistral_base_config(self): assert ( base_config.log == logger_config ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" - + def test_mixtral_base_config(self): # Mixtral 8x3B model_config = run.Config(MixtralConfig8x3B) @@ -262,12 +253,8 @@ def test_mixtral_base_config(self): assert ( base_config.model == model_config ), f"{model_config} is expected class object but got {base_config.model}" - assert ( - model_size == 3 - ), f"3 is expected size for {model_config} but got {model_size}" - assert ( - model_type == "mixtral" - ), f"mixtral is expected model type for {model_config} but got {model_type}" + assert model_size == 3, f"3 is expected size for {model_config} but got {model_size}" + assert model_type == "mixtral", f"mixtral is expected model type for {model_config} but got {model_type}" assert ( base_config.data == data_config ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" @@ -280,7 +267,7 @@ def test_mixtral_base_config(self): assert ( base_config.log == logger_config ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" - + def test_gemma_base_config(self): # Gemma 2B model_config = run.Config(GemmaConfig2B) @@ -303,12 +290,8 @@ def test_gemma_base_config(self): assert ( base_config.model == model_config ), f"{model_config} is expected class object but got {base_config.model}" - assert ( - model_size == 2 - ), f"2 is expected size for {model_config} but got {model_size}" - assert ( - model_type == "gemma" - ), f"gemma is expected model type for {model_config} but got {model_type}" + assert model_size == 2, f"2 is expected size for {model_config} but got {model_size}" + assert model_type == "gemma", f"gemma is expected model type for {model_config} but got {model_type}" assert ( base_config.data == data_config ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" @@ -321,7 +304,7 @@ def test_gemma_base_config(self): assert ( base_config.log == logger_config ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" - + def test_nemotron_base_config(self): # Nemotron 22B model_config = run.Config(Nemotron4Config22B) @@ -344,12 +327,8 @@ def test_nemotron_base_config(self): assert ( 
base_config.model == model_config ), f"{model_config} is expected class object but got {base_config.model}" - assert ( - model_size == 22 - ), f"22 is expected size for {model_config} but got {model_size}" - assert ( - model_type == "nemotron" - ), f"nemotron is expected model type for {model_config} but got {model_type}" + assert model_size == 22, f"22 is expected size for {model_config} but got {model_size}" + assert model_type == "nemotron", f"nemotron is expected model type for {model_config} but got {model_type}" assert ( base_config.data == data_config ), f"f{data_config} is expected data config for {model_config} but got {base_config.data}" @@ -361,4 +340,4 @@ def test_nemotron_base_config(self): ), f"f{optim_config} is expected trainer config for {model_config} but got {base_config.optim}" assert ( base_config.log == logger_config - ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" \ No newline at end of file + ), f"f{logger_config} is expected trainer config for {model_config} but got {logger_config}" diff --git a/tests/collections/llm/auto_conf/test_generate_configs.py b/tests/collections/llm/auto_conf/test_generate_configs.py index 6386a2f09c7a..efb3bcf9a0ba 100644 --- a/tests/collections/llm/auto_conf/test_generate_configs.py +++ b/tests/collections/llm/auto_conf/test_generate_configs.py @@ -1,14 +1,14 @@ import nemo_run as run -from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs from nemo.collections.llm import ( + GemmaConfig7B, GPTConfig5B, Llama3Config70B, MistralConfig7B, MixtralConfig8x22B, - GemmaConfig7B, Nemotron3Config8B, ) +from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs def get_auto_configs(configs): @@ -72,7 +72,7 @@ def test_gpt_model(self): 1, 2, ], f"[4, 2, 1, 1, 2] is expected configuration output but got {auto_configs[1]}." - + def test_llama_model(self): # Llama3 70B runner = AutoConfigurator( @@ -125,7 +125,7 @@ def test_llama_model(self): 1, 1, ], f"[8, 1, 4, 1, 1] is expected configuration output but got {auto_configs[2]}." - + def test_mistral_model(self): # Mistral 7B runner = AutoConfigurator( @@ -170,7 +170,7 @@ def test_mistral_model(self): 1, 1, ], f"[4, 2, 1, 1, 1] is expected configuration output but got {auto_configs[1]}." - + def test_mixtral_model(self): # Mixtral 8x22B runner = AutoConfigurator( @@ -304,4 +304,4 @@ def test_nemotron_model(self): 1, 1, 2, - ], f"[2, 2, 1, 1, 2] is expected configuration output but got {auto_configs[1]}." \ No newline at end of file + ], f"[2, 2, 1, 1, 2] is expected configuration output but got {auto_configs[1]}." diff --git a/tests/collections/llm/auto_conf/test_utils.py b/tests/collections/llm/auto_conf/test_utils.py index 339c6aea0ee5..0faa86c13016 100644 --- a/tests/collections/llm/auto_conf/test_utils.py +++ b/tests/collections/llm/auto_conf/test_utils.py @@ -128,4 +128,4 @@ def test_calculate_train_time(self): 55, "nemotron", ) - assert train_time == 540.12, f"expected train_time is 540.12 but got {train_time}." \ No newline at end of file + assert train_time == 540.12, f"expected train_time is 540.12 but got {train_time}." 
From 83e537d1a9b9cdf81908bb1b0525623c85955ae2 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 05:55:12 -0700 Subject: [PATCH 54/63] add README Signed-off-by: dimapihtar --- examples/llm/auto_configurator/README.md | 85 ++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 examples/llm/auto_configurator/README.md diff --git a/examples/llm/auto_configurator/README.md b/examples/llm/auto_configurator/README.md new file mode 100644 index 000000000000..a96548b1700f --- /dev/null +++ b/examples/llm/auto_configurator/README.md @@ -0,0 +1,85 @@ +======================================================= +Use Auto Configurator to Find the Optimal Configuration +======================================================= + +Auto Configurator searches for hyperparameters (HPs) that achieve the maximum highest training throughput when working with Large Language Models (LLMs) utilizing the NeMo Framework. + +.. note:: + Auto Configurator is only supported now for GPT-based models: GPT3, LLama, Mixtral, Mistral, Gemma and Nemotron. + +Auto Configurator Capabilities +------------------------------ + +Auto Configurator is intended to iterate over different model configurations quickly and find the best configuration, that is, the configuration that minimizes both time and financial expenditure. It offers a range of features to facilitate this, as detailed in the list below. + +- Model size recommendation: finds the optimal model size if the parameter is not specified. +- Training time estimation: estimates model training time based on input parameters. +- Base configuration generation: returns a basic model configuration. +- Hyperparameters recommendation: finds the optimal list of hyperparameters to be trained. +- Optimal configuration recommendation: calculates the performance after a short training of candidate configurations and finds the optimal model configuration. + +Model Size Recommendation +######################### + +If you have not decided what model size you want to train, Auto Configurator can recommend a model size for your use case. If you know the number of GPUs, TFLOPS per GPU, the maximum time to train, and the number of tokens to train for, it can recommend a model size that can be trained with the specified hardware and time constraints. + +For example, if you had 20 NVIDIA DGX nodes available (in 80\ |_GB| GPU memory), and wanted to train a GPT model for a maximum of 5\ |_days|, Auto Configurator would recommend using a 5B parameter GPT model. + +Training Time Estimation +######################## + +Auto Configurator calculates the estimated training time for your model. It provides a projection of the training time in days, based on the input dataset and parameters you provide. + +Base Configuration Generation +############################# + +When you provide the model size, or Auto Configurator has suggested one, it generates a base configuration for the target model. The base configuration is a valid configuration in NeMo 2.0 format. The optimization of throughput, however, is conducted in the next step, :ref:`Training Auto Configurator HP Search `. + +.. _TrainingWithPredefinedConfigurations_TrainingAutoconfiguratorHpSearch: + +Hyperparameters Recommendation +############################## + +After Auto Configurator generates the base configuration, it searches over four critical hyperparameters that have a great impact on training throughput but do not affect model convergence. 
These hyperparameters include Tensor Parallelism (TP), Pipeline Parallelism (PP), Context Parallelism (CP), Expert Parallelism (EP), Micro Batch Size (MBS), and Activation Checkpointing Layers (ActCkpt). Auto Configurator will also provide optimal Global Batch Size (GBS) if it's not specified. + +Auto Configurator initially applies heuristics to identify suitable candidates for the four key parameters, subsequently generating a grid of candidate configurations. It returns all of the candidate configurations in NeMo 2.0 format. + +.. note:: + Some of the candidate configurations may not work due to high-memory usage or other issues. + +Once the candidate configurations are generated, you can use NeMo Framework to launch the most promising candidates. + +When running the candidates on the cluster, you can limit job time and job max steps by using ``max_minutes_per_run`` and ``max_steps_per_run`` parameters. During this search, the jobs will run with the number of nodes specified in the configuration files, using the ``num_nodes`` parameter. Once all of the jobs have finished running, you'll need to run compare_throughput.py to get a .csv table with performance results for each succeeded job. + +Optimal Configuration Recommendation +#################################### + +After all of the candidate jobs are done, Auto Configurator calculates performance parameters for each of the candidates. +Auto Configurator generates two .csv files: one detailing the performance measures of the candidates and another listing the candidates that failed due to out-of-memory errors. + +End-To-End Example +################## + +The following list shows the required input parameters for the Auto Configurator runner: + +- ``model``: model configuration based on NeMo 2.0. +- ``num_nodes``: number of nodes to be used for the training. +- ``seq_length``: sequence length to be used for the training. +- ``data_paths``: dataset to be used for the training. +- ``tokenizer_path``: path to tokenizer model if custom tokenizer will be used. + +The following list shows the optional parameters for the Auto Configurator runner: + +- ``global_batch_size``: global batch size to be used. +- ``tensor_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``pipeline_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``context_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``expert_parallel_sizes``: a list, such as ``[1, 2, 4]``. +- ``micro_batch_sizes``: a list, such as ``[1, 2, 4]``. +- ``min_model_parallel_size``: a value for the minimum desired parallelism. +- ``max_model_parallel_size``: a value for the maximum desired parallelism. + +For each of the optional parameters, Auto Configurator will find the optimal value if the parameter is not specified. To view the full list of parameters, please visit `this page `__. + +To view an end-to-end example of how to generate candidate configs, train them, and calculate the performance using Auto Configurator with NeMo Framework, please visit `this page `__. 
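+
+For a quick impression of the workflow before opening that example, the sketch below shows one way the runner can be driven end to end. Every value, path, and parallelism range in it is an illustrative placeholder, and the tuple returned by ``generate_configs`` follows the usage in this PR's unit tests (a base configuration plus a dictionary of candidate configs keyed by run name).
+
+```python
+import nemo_run as run
+
+from nemo.collections.llm import MistralConfig7B
+from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs
+
+# Describe the model and the search space; every value below is illustrative.
+runner = AutoConfigurator(
+    model=run.Config(MistralConfig7B),
+    num_nodes=16,
+    seq_length=4096,
+    global_batch_size=2048,
+    tensor_parallel_sizes=[2, 4],
+    pipeline_parallel_sizes=[1, 2],
+    micro_batch_sizes=[1, 2],
+    max_steps_per_run=25,
+    max_minutes_per_run=30,
+    data_paths="/path/to/preprocessed/data",
+    path_to_logs="/path/to/logs",
+)
+
+# First element: the generated base configuration; second: a dictionary of
+# candidate configs keyed by run name, ready to be launched for the search.
+base_config, configs = generate_configs(runner)
+```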
+ From 1aa3636fd4634b6ced9fb2119503b8f0ef210de2 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 05:59:48 -0700 Subject: [PATCH 55/63] fix README Signed-off-by: dimapihtar --- examples/llm/auto_configurator/README.md | 25 ++++++++++++------------ 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/examples/llm/auto_configurator/README.md b/examples/llm/auto_configurator/README.md index a96548b1700f..72ad31c7a0a2 100644 --- a/examples/llm/auto_configurator/README.md +++ b/examples/llm/auto_configurator/README.md @@ -1,6 +1,5 @@ -======================================================= Use Auto Configurator to Find the Optimal Configuration -======================================================= +------------------------------------------------------- Auto Configurator searches for hyperparameters (HPs) that achieve the maximum highest training throughput when working with Large Language Models (LLMs) utilizing the NeMo Framework. @@ -12,33 +11,33 @@ Auto Configurator Capabilities Auto Configurator is intended to iterate over different model configurations quickly and find the best configuration, that is, the configuration that minimizes both time and financial expenditure. It offers a range of features to facilitate this, as detailed in the list below. -- Model size recommendation: finds the optimal model size if the parameter is not specified. -- Training time estimation: estimates model training time based on input parameters. -- Base configuration generation: returns a basic model configuration. -- Hyperparameters recommendation: finds the optimal list of hyperparameters to be trained. -- Optimal configuration recommendation: calculates the performance after a short training of candidate configurations and finds the optimal model configuration. +- **Model size recommendation**: finds the optimal model size if the parameter is not specified. +- **Training time estimation**: estimates model training time based on input parameters. +- **Base configuration generation**: returns a basic model configuration. +- **Hyperparameters recommendation**: finds the optimal list of hyperparameters to be trained. +- **Optimal configuration recommendation**: calculates the performance after a short training of candidate configurations and finds the optimal model configuration. Model Size Recommendation -######################### +------------------------- If you have not decided what model size you want to train, Auto Configurator can recommend a model size for your use case. If you know the number of GPUs, TFLOPS per GPU, the maximum time to train, and the number of tokens to train for, it can recommend a model size that can be trained with the specified hardware and time constraints. For example, if you had 20 NVIDIA DGX nodes available (in 80\ |_GB| GPU memory), and wanted to train a GPT model for a maximum of 5\ |_days|, Auto Configurator would recommend using a 5B parameter GPT model. Training Time Estimation -######################## +------------------------ Auto Configurator calculates the estimated training time for your model. It provides a projection of the training time in days, based on the input dataset and parameters you provide. Base Configuration Generation -############################# +----------------------------- When you provide the model size, or Auto Configurator has suggested one, it generates a base configuration for the target model. The base configuration is a valid configuration in NeMo 2.0 format. 
The optimization of throughput, however, is conducted in the next step, :ref:`Training Auto Configurator HP Search `. .. _TrainingWithPredefinedConfigurations_TrainingAutoconfiguratorHpSearch: Hyperparameters Recommendation -############################## +------------------------------ After Auto Configurator generates the base configuration, it searches over four critical hyperparameters that have a great impact on training throughput but do not affect model convergence. These hyperparameters include Tensor Parallelism (TP), Pipeline Parallelism (PP), Context Parallelism (CP), Expert Parallelism (EP), Micro Batch Size (MBS), and Activation Checkpointing Layers (ActCkpt). Auto Configurator will also provide optimal Global Batch Size (GBS) if it's not specified. @@ -79,7 +78,7 @@ The following list shows the optional parameters for the Auto Configurator runne - ``min_model_parallel_size``: a value for the minimum desired parallelism. - ``max_model_parallel_size``: a value for the maximum desired parallelism. -For each of the optional parameters, Auto Configurator will find the optimal value if the parameter is not specified. To view the full list of parameters, please visit `this page `__. +For each of the optional parameters, Auto Configurator will find the optimal value if the parameter is not specified. To view the full list of parameters, please visit [this page](https://github.com/NVIDIA/NeMo/blob/dpykhtar/nemo_autoconf/nemo/collections/llm/tools/auto_configurator/runner.py#L51). -To view an end-to-end example of how to generate candidate configs, train them, and calculate the performance using Auto Configurator with NeMo Framework, please visit `this page `__. +To view an end-to-end example of how to generate candidate configs, train them, and calculate the performance using Auto Configurator with NeMo Framework, please visit [this page](https://github.com/NVIDIA/NeMo/blob/dpykhtar/nemo_autoconf/examples/llm/auto_configurator/auto_config.py). From 649eb44edd79493e4d6bbf5f698a5ff60d901570 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 06:02:30 -0700 Subject: [PATCH 56/63] fix README Signed-off-by: dimapihtar --- examples/llm/auto_configurator/README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/llm/auto_configurator/README.md b/examples/llm/auto_configurator/README.md index 72ad31c7a0a2..8f4c1501ae40 100644 --- a/examples/llm/auto_configurator/README.md +++ b/examples/llm/auto_configurator/README.md @@ -3,8 +3,8 @@ Use Auto Configurator to Find the Optimal Configuration Auto Configurator searches for hyperparameters (HPs) that achieve the maximum highest training throughput when working with Large Language Models (LLMs) utilizing the NeMo Framework. -.. note:: - Auto Configurator is only supported now for GPT-based models: GPT3, LLama, Mixtral, Mistral, Gemma and Nemotron. +> [!NOTE] +> Auto Configurator is only supported now for GPT-based models: GPT3, LLama, Mixtral, Mistral, Gemma and Nemotron. Auto Configurator Capabilities ------------------------------ @@ -22,7 +22,7 @@ Model Size Recommendation If you have not decided what model size you want to train, Auto Configurator can recommend a model size for your use case. If you know the number of GPUs, TFLOPS per GPU, the maximum time to train, and the number of tokens to train for, it can recommend a model size that can be trained with the specified hardware and time constraints. 
-For example, if you had 20 NVIDIA DGX nodes available (in 80\ |_GB| GPU memory), and wanted to train a GPT model for a maximum of 5\ |_days|, Auto Configurator would recommend using a 5B parameter GPT model. +For example, if you had 20 NVIDIA DGX nodes available (in 80 GB GPU memory), and wanted to train a GPT model for a maximum of 5 days, Auto Configurator would recommend using a 5B parameter GPT model. Training Time Estimation ------------------------ @@ -43,21 +43,21 @@ After Auto Configurator generates the base configuration, it searches over four Auto Configurator initially applies heuristics to identify suitable candidates for the four key parameters, subsequently generating a grid of candidate configurations. It returns all of the candidate configurations in NeMo 2.0 format. -.. note:: - Some of the candidate configurations may not work due to high-memory usage or other issues. +> [!NOTE] +> Some of the candidate configurations may not work due to high-memory usage or other issues. Once the candidate configurations are generated, you can use NeMo Framework to launch the most promising candidates. When running the candidates on the cluster, you can limit job time and job max steps by using ``max_minutes_per_run`` and ``max_steps_per_run`` parameters. During this search, the jobs will run with the number of nodes specified in the configuration files, using the ``num_nodes`` parameter. Once all of the jobs have finished running, you'll need to run compare_throughput.py to get a .csv table with performance results for each succeeded job. Optimal Configuration Recommendation -#################################### +------------------------------------ After all of the candidate jobs are done, Auto Configurator calculates performance parameters for each of the candidates. Auto Configurator generates two .csv files: one detailing the performance measures of the candidates and another listing the candidates that failed due to out-of-memory errors. End-To-End Example -################## +------------------ The following list shows the required input parameters for the Auto Configurator runner: From c64228186ae3b5bd0948b158bda118c85cd561a9 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 06:14:54 -0700 Subject: [PATCH 57/63] fix readme Signed-off-by: dimapihtar --- examples/llm/auto_configurator/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/llm/auto_configurator/README.md b/examples/llm/auto_configurator/README.md index 8f4c1501ae40..2833c13d076d 100644 --- a/examples/llm/auto_configurator/README.md +++ b/examples/llm/auto_configurator/README.md @@ -1,3 +1,6 @@ +> [!IMPORTANT] +> This is an early version of the Auto Configurator, and the code base can be modified as it will be integrated into the CLI. + Use Auto Configurator to Find the Optimal Configuration ------------------------------------------------------- @@ -32,9 +35,7 @@ Auto Configurator calculates the estimated training time for your model. It prov Base Configuration Generation ----------------------------- -When you provide the model size, or Auto Configurator has suggested one, it generates a base configuration for the target model. The base configuration is a valid configuration in NeMo 2.0 format. The optimization of throughput, however, is conducted in the next step, :ref:`Training Auto Configurator HP Search `. - -.. 
_TrainingWithPredefinedConfigurations_TrainingAutoconfiguratorHpSearch: +When you provide the model size, or Auto Configurator has suggested one, it generates a base configuration for the target model. The base configuration is a valid configuration in NeMo 2.0 format. The optimization of throughput, however, is conducted in the next step. Hyperparameters Recommendation ------------------------------ From 73096038989cb7aff78b189d3a8490d9a77a4f54 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 06:15:52 -0700 Subject: [PATCH 58/63] fix readme Signed-off-by: dimapihtar --- examples/llm/auto_configurator/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llm/auto_configurator/README.md b/examples/llm/auto_configurator/README.md index 2833c13d076d..26cf5cd75263 100644 --- a/examples/llm/auto_configurator/README.md +++ b/examples/llm/auto_configurator/README.md @@ -49,13 +49,13 @@ Auto Configurator initially applies heuristics to identify suitable candidates f Once the candidate configurations are generated, you can use NeMo Framework to launch the most promising candidates. -When running the candidates on the cluster, you can limit job time and job max steps by using ``max_minutes_per_run`` and ``max_steps_per_run`` parameters. During this search, the jobs will run with the number of nodes specified in the configuration files, using the ``num_nodes`` parameter. Once all of the jobs have finished running, you'll need to run compare_throughput.py to get a .csv table with performance results for each succeeded job. +When running the candidates on the cluster, you can limit job time and job max steps by using ``max_minutes_per_run`` and ``max_steps_per_run`` parameters. During this search, the jobs will run with the number of nodes specified in the configuration files, using the ``num_nodes`` parameter. Once all of the jobs have finished running, you'll need to run compare_throughput.py to get a ``.csv`` table with performance results for each succeeded job. Optimal Configuration Recommendation ------------------------------------ After all of the candidate jobs are done, Auto Configurator calculates performance parameters for each of the candidates. -Auto Configurator generates two .csv files: one detailing the performance measures of the candidates and another listing the candidates that failed due to out-of-memory errors. +Auto Configurator generates two ``.csv`` files: one detailing the performance measures of the candidates and another listing the candidates that failed due to out-of-memory errors. 
End-To-End Example ------------------ From df1dcb8038ec49309fa931142e85bf2fa90832e7 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 06:39:42 -0700 Subject: [PATCH 59/63] remove extra arg Signed-off-by: dimapihtar --- nemo/collections/llm/tools/auto_configurator/core/base_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index ce117dae49be..d60b6e43fdc1 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -202,7 +202,6 @@ def get_logger(self) -> Config[nl.NeMoLogger]: ckpt = Config( nl.ModelCheckpoint, monitor="reduced_train_loss", - save_best_model=False, save_last=False, save_top_k=0, ) From 25a148a9a6efe5479481325936a0ca51274c6e5a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 07:21:42 -0700 Subject: [PATCH 60/63] remove unused imports Signed-off-by: dimapihtar --- examples/llm/auto_configurator/auto_config.py | 2 -- .../llm/tools/auto_configurator/core/base_config.py | 8 +------- .../tools/auto_configurator/core/calculate_performance.py | 3 --- .../llm/tools/auto_configurator/core/training_config.py | 5 +---- .../collections/llm/tools/auto_configurator/core/utils.py | 4 ---- nemo/collections/llm/tools/auto_configurator/runner.py | 4 ---- tests/collections/llm/auto_conf/test_base_configs.py | 3 +-- 7 files changed, 3 insertions(+), 26 deletions(-) diff --git a/examples/llm/auto_configurator/auto_config.py b/examples/llm/auto_configurator/auto_config.py index 0dc07513e38a..c202d4d33325 100644 --- a/examples/llm/auto_configurator/auto_config.py +++ b/examples/llm/auto_configurator/auto_config.py @@ -14,7 +14,6 @@ import argparse import os -import shutil import fiddle as fdl import nemo_run as run @@ -52,7 +51,6 @@ def train_config(args): max_steps_per_run=25, num_tokens_in_b=10, vocab_size=51200, - tokenizer_path="/home/models/gpt2", data_paths=args.data_path, path_to_logs=args.logs_dir, ) diff --git a/nemo/collections/llm/tools/auto_configurator/core/base_config.py b/nemo/collections/llm/tools/auto_configurator/core/base_config.py index d60b6e43fdc1..ee1579f6f6e8 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/base_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/base_config.py @@ -12,19 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import math -import os -from dataclasses import dataclass, field -from typing import Tuple - import torch from megatron.core.optimizer import OptimizerConfig from pytorch_lightning.loggers import TensorBoardLogger from nemo import lightning as nl from nemo.collections.common.tokenizers import AutoTokenizer, SentencePieceTokenizer -from nemo.collections.llm import GPTModel, PreTrainingDataModule -from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config +from nemo.collections.llm import PreTrainingDataModule from nemo.collections.llm.utils import Config from nemo.lightning.pytorch.optim import CosineAnnealingScheduler, MegatronOptimizerModule from nemo.utils.exp_manager import TimingCallback diff --git a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py index 43f1c6117929..5b7ac0ebc4d3 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py +++ b/nemo/collections/llm/tools/auto_configurator/core/calculate_performance.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import csv import os import re from typing import Optional @@ -20,8 +19,6 @@ import pandas as pd from tensorboard.backend.event_processing import event_accumulator -from nemo.collections.llm.tools.auto_configurator.core.utils import generic_base_config - def get_results( base_config=None, diff --git a/nemo/collections/llm/tools/auto_configurator/core/training_config.py b/nemo/collections/llm/tools/auto_configurator/core/training_config.py index 1a7c629aa583..087bf3c6fb0e 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/training_config.py +++ b/nemo/collections/llm/tools/auto_configurator/core/training_config.py @@ -12,10 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import shutil -import subprocess -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import List, Tuple from nemo.collections.llm.tools.auto_configurator.core import utils diff --git a/nemo/collections/llm/tools/auto_configurator/core/utils.py b/nemo/collections/llm/tools/auto_configurator/core/utils.py index be9a6ed3e3a1..3441c7cdbf9b 100644 --- a/nemo/collections/llm/tools/auto_configurator/core/utils.py +++ b/nemo/collections/llm/tools/auto_configurator/core/utils.py @@ -12,11 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy from dataclasses import dataclass -from typing import List, Optional, Tuple - -from nemo.collections.llm.utils import Config GPT_BASED_MODELS = [ diff --git a/nemo/collections/llm/tools/auto_configurator/runner.py b/nemo/collections/llm/tools/auto_configurator/runner.py index 278f860acbea..0c80c9a21a9e 100644 --- a/nemo/collections/llm/tools/auto_configurator/runner.py +++ b/nemo/collections/llm/tools/auto_configurator/runner.py @@ -13,14 +13,10 @@ # limitations under the License. 
import copy -import os import re -from dataclasses import dataclass from typing import List, Optional -import nemo_run as run - from nemo.collections.llm import GPTModel from nemo.collections.llm.api import pretrain from nemo.collections.llm.tools.auto_configurator.core.training_config import generate_grid_search_configs diff --git a/tests/collections/llm/auto_conf/test_base_configs.py b/tests/collections/llm/auto_conf/test_base_configs.py index 2e1a1aa5264e..2ff5b2983aa4 100644 --- a/tests/collections/llm/auto_conf/test_base_configs.py +++ b/tests/collections/llm/auto_conf/test_base_configs.py @@ -1,7 +1,6 @@ -import re - import nemo_run as run import torch + from megatron.core.optimizer import OptimizerConfig from pytorch_lightning.loggers import TensorBoardLogger From f00637200cf4215ad50160b41b42a4b54453d913 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 13:10:47 -0700 Subject: [PATCH 61/63] add nemo-run installation Signed-off-by: dimapihtar --- Dockerfile.ci | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Dockerfile.ci b/Dockerfile.ci index 3d9a9d9b08a1..33490a6d9079 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -31,6 +31,10 @@ EOF WORKDIR /workspace +RUN pip install hatchling # needed to install nemo-run +ARG NEMU_RUN_TAG=34259bd3e752fef94045a9a019e4aaf62bd11ce2 +RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMU_RUN_TAG} + # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.15.0 From 9dda193f318541b9a86f7f46c0712ea240a18a4a Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 6 Sep 2024 22:48:20 -0700 Subject: [PATCH 62/63] fix unit tests Signed-off-by: dimapihtar --- tests/collections/llm/auto_conf/test_base_configs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/collections/llm/auto_conf/test_base_configs.py b/tests/collections/llm/auto_conf/test_base_configs.py index 2ff5b2983aa4..46ee49ae0629 100644 --- a/tests/collections/llm/auto_conf/test_base_configs.py +++ b/tests/collections/llm/auto_conf/test_base_configs.py @@ -111,7 +111,6 @@ def get_logger() -> run.Config[nl.NeMoLogger]: ckpt = run.Config( nl.ModelCheckpoint, monitor="reduced_train_loss", - save_best_model=False, save_last=False, save_top_k=0, ) From c4c5ecbc25e4a83824f3b237053e37fa5198f59d Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Sat, 7 Sep 2024 11:36:58 -0700 Subject: [PATCH 63/63] fix unit tests Signed-off-by: dimapihtar --- .../llm/auto_conf/{test_utils.py => test_autoconf_utils.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/collections/llm/auto_conf/{test_utils.py => test_autoconf_utils.py} (100%) diff --git a/tests/collections/llm/auto_conf/test_utils.py b/tests/collections/llm/auto_conf/test_autoconf_utils.py similarity index 100% rename from tests/collections/llm/auto_conf/test_utils.py rename to tests/collections/llm/auto_conf/test_autoconf_utils.py