add auto configurator to NeMo #10270

Merged Sep 7, 2024 · 66 commits

Commits (the file diff below shows changes from a single commit)
cd3bb0b
add base configs
dimapihtar Aug 27, 2024
28d3c02
add auto configurator functionality
dimapihtar Aug 27, 2024
cf50b14
Apply isort and black reformatting
dimapihtar Aug 27, 2024
e89ed25
add runner
dimapihtar Aug 27, 2024
0a2a6a0
add end-to-end example for auto configurator
dimapihtar Aug 27, 2024
b0d8478
add unit tests for auto configurator
dimapihtar Aug 27, 2024
8189de9
add GPT configs
dimapihtar Aug 27, 2024
a551658
add GPT configs
dimapihtar Aug 27, 2024
a28f77b
Apply isort and black reformatting
dimapihtar Aug 27, 2024
35522ab
switch to dataclass
dimapihtar Aug 27, 2024
399385b
Apply isort and black reformatting
dimapihtar Aug 27, 2024
b616b41
switch to dataclass
dimapihtar Aug 27, 2024
80054d7
Apply isort and black reformatting
dimapihtar Aug 27, 2024
227a738
fix dataclasses usage
dimapihtar Aug 27, 2024
d0acbca
Apply isort and black reformatting
dimapihtar Aug 27, 2024
9a26476
remove unused imports
dimapihtar Aug 28, 2024
1315031
remove extra function
dimapihtar Aug 28, 2024
1aafc20
fix docstring style
dimapihtar Aug 28, 2024
bda0100
Apply isort and black reformatting
dimapihtar Aug 28, 2024
6d5305e
take Config object as input for model
dimapihtar Aug 28, 2024
bb86c39
Apply isort and black reformatting
dimapihtar Aug 28, 2024
a2099af
add nemotron support
dimapihtar Aug 28, 2024
86694e6
Apply isort and black reformatting
dimapihtar Aug 28, 2024
0b896b7
remove search_config.py
dimapihtar Sep 2, 2024
2d062b0
Apply isort and black reformatting
dimapihtar Sep 2, 2024
1e30118
move configs creation to Basic class
dimapihtar Sep 3, 2024
1f8dde6
Merge branch 'main' into dpykhtar/autoconf
dimapihtar Sep 3, 2024
14b9549
Apply isort and black reformatting
dimapihtar Sep 3, 2024
e1ccec1
move to common basic class
dimapihtar Sep 3, 2024
c641b7d
Apply isort and black reformatting
dimapihtar Sep 3, 2024
71b0420
rename main config
dimapihtar Sep 3, 2024
4103009
remove base configs for models
dimapihtar Sep 3, 2024
e3793ad
Apply isort and black reformatting
dimapihtar Sep 3, 2024
5815586
Apply isort and black reformatting
artbataev Sep 3, 2024
4d03be0
change auto conf functionality
dimapihtar Sep 3, 2024
f812e2b
Apply isort and black reformatting
dimapihtar Sep 3, 2024
97f9e61
fix docstring
dimapihtar Sep 4, 2024
d2fed7a
Apply isort and black reformatting
dimapihtar Sep 4, 2024
eb9bae5
remove unused imports
dimapihtar Sep 4, 2024
4606ef3
add changes
dimapihtar Sep 4, 2024
a4e8128
remove activations_checkpoint_num_layers
dimapihtar Sep 4, 2024
b853a83
remove gbs from config
dimapihtar Sep 4, 2024
7040056
fix logs
dimapihtar Sep 4, 2024
ae744ae
Apply isort and black reformatting
dimapihtar Sep 4, 2024
eda32ce
fix performance calculation
dimapihtar Sep 4, 2024
ae46957
fix end-to-end example
dimapihtar Sep 5, 2024
1fe46ed
Apply isort and black reformatting
dimapihtar Sep 5, 2024
25b8e3f
Merge branch 'main' into dpykhtar/autoconf
dimapihtar Sep 5, 2024
38082d9
fix model config
dimapihtar Sep 5, 2024
0ce1672
Apply isort and black reformatting
dimapihtar Sep 5, 2024
41c9f29
minor changes
dimapihtar Sep 5, 2024
3fdcc83
minor changes
dimapihtar Sep 5, 2024
3fbcc16
Apply isort and black reformatting
dimapihtar Sep 5, 2024
010e0de
fix unit tests
dimapihtar Sep 6, 2024
7fd82cf
Apply isort and black reformatting
dimapihtar Sep 6, 2024
3a345e8
Merge branch 'main' into dpykhtar/autoconf
dimapihtar Sep 6, 2024
83e537d
add README
dimapihtar Sep 6, 2024
1aa3636
fix README
dimapihtar Sep 6, 2024
649eb44
fix README
dimapihtar Sep 6, 2024
c642281
fix readme
dimapihtar Sep 6, 2024
7309603
fix readme
dimapihtar Sep 6, 2024
df1dcb8
remove extra arg
dimapihtar Sep 6, 2024
25a148a
remove unused imports
dimapihtar Sep 6, 2024
f006372
add nemo-run installation
dimapihtar Sep 6, 2024
9dda193
fix unit tests
dimapihtar Sep 7, 2024
c4c5ecb
fix unit tests
dimapihtar Sep 7, 2024
2 changes: 2 additions & 0 deletions nemo/collections/llm/tools/auto_configurator/__init__.py
@@ -0,0 +1,2 @@
from nemo.collections.llm.tools.auto_configurator.core.calculate_performance import get_results
from nemo.collections.llm.tools.auto_configurator.runner import AutoConfigurator
20 changes: 20 additions & 0 deletions nemo/collections/llm/tools/auto_configurator/base_configs/__init__.py
@@ -0,0 +1,20 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.collections.llm.tools.auto_configurator.base_configs.custom import custom
from nemo.collections.llm.tools.auto_configurator.base_configs.gemma import Gemma
from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT
from nemo.collections.llm.tools.auto_configurator.base_configs.llama import Llama
from nemo.collections.llm.tools.auto_configurator.base_configs.mistral import Mistral
from nemo.collections.llm.tools.auto_configurator.base_configs.mixtral import Mixtral
144 changes: 144 additions & 0 deletions nemo/collections/llm/tools/auto_configurator/base_configs/basic.py
@@ -0,0 +1,144 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from megatron.core.optimizer import OptimizerConfig

from nemo.collections.llm.utils import Config


class Basic:
def __init__(
self,
name: str = None,
version: int = None,
size: int = None,
measure: str = "B",
cfg: dict = {},
):
"""
:param str name: model name.
:param int version: model version.
:param int size: model size.
:param str measure: measure of model size. "M" if model size in millions, "B" if in billions.
:param dict cfg: auto configurator runner config.
"""

self.name = name
self.version = version
self.size = size
self.measure = measure
self.cfg = cfg
self.num_nodes = cfg.get("num_nodes")
self.num_gpus = cfg.get("num_gpus")
self.max_steps = cfg.get("max_steps_per_run")
self.seq_length = cfg.get("seq_length")
self.global_batch_size = cfg.get("global_batch_size")
self.tokenizer_path = cfg.get("tokenizer_path")
self.data_paths = cfg.get("data_paths")
self.nemo_run = cfg.get("nemo_run")
self.max_minutes_per_run = cfg.get("max_minutes_per_run")

def model_config(self):
"""Function that returns model config."""

return None

def get_optim_config(self) -> OptimizerConfig:
"""
Function that returns optimizer config.
:return: optim config.
:rtype: OptimizerConfig.
"""
optim_params = {
"optimizer": "adam",
"lr": 1e-4,
"min_lr": 1e-5,
"use_distributed_optimizer": True,
"bf16": True,
"adam_beta1": 0.9,
"adam_beta2": 0.95,
"overlap_grad_reduce": False,
"overlap_param_gather": True,
}

if self.nemo_run:
optim_config = Config(
OptimizerConfig,
**optim_params,
)
else:
optim_config = OptimizerConfig(
**optim_params,
)

return optim_config

def get_trainer_config(self) -> dict:
"""
Function that returns config for PTL trainer.
:return: trainer config.
:rtype: dict.
"""

trainer_config = {
"accelerator": "gpu",
"enable_checkpointing": False,
"use_distributed_sampler": False,
"max_epochs": None,
"log_every_n_steps": 1,
"limit_val_batches": 1,
"limit_test_batches": 1,
"accumulate_grad_batches": 1,
"gradient_clip_val": 1.0,
"num_nodes": self.num_nodes,
"devices": self.num_gpus,
"max_steps": self.max_steps,
"val_check_interval": self.max_steps,
}

return trainer_config

def get_data_config(self) -> dict:
"""
Function that returns dataset config.
:return: data config.
:rtype: dict.
"""

data_config = {
"paths": self.data_paths,
"seq_length": self.seq_length,
"global_batch_size": self.global_batch_size,
"num_workers": 2,
# "split": "99990,8,2",
"index_mapping_dir": None,
}

return data_config

def get_run_config(self) -> dict:
"""
Function that returns config for cluster job.
:return: cluster job config.
:rtype: dict.
"""

run_config = {
"name": f"{self.name}_{self.size}{self.measure}",
"results_dir": None,
"time_limit": f"0-00:{self.max_minutes_per_run}:00",
}

return run_config
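
A minimal usage sketch of the `Basic` getters above, assuming a hypothetical runner `cfg` dict (the paths and numbers are placeholders, not values from this PR):

```python
from nemo.collections.llm.tools.auto_configurator.base_configs.basic import Basic

# Hypothetical runner config; keys mirror the cfg.get(...) calls in Basic.__init__.
cfg = {
    "num_nodes": 1,
    "num_gpus": 8,
    "max_steps_per_run": 50,
    "seq_length": 2048,
    "global_batch_size": 256,
    "tokenizer_path": "/path/to/tokenizer",  # placeholder
    "data_paths": ["/path/to/dataset"],      # placeholder
    "nemo_run": False,  # plain objects instead of nemo-run Config wrappers
    "max_minutes_per_run": 30,
}

base = Basic(name="GPT", size=5, measure="B", cfg=cfg)
optim = base.get_optim_config()      # OptimizerConfig(optimizer="adam", lr=1e-4, bf16=True, ...)
trainer = base.get_trainer_config()  # dict: num_nodes=1, devices=8, max_steps=50, ...
data = base.get_data_config()        # dict: paths, seq_length, global_batch_size, ...
run = base.get_run_config()          # {"name": "GPT_5B", "time_limit": "0-00:30:00", ...}
```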
42 changes: 42 additions & 0 deletions nemo/collections/llm/tools/auto_configurator/base_configs/custom.py
@@ -0,0 +1,42 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import os

from nemo.collections.llm.tools.auto_configurator import base_configs

from .basic import Basic


def custom(name, cfg):
"""
Function that returns a custom model class.
:param str name: model name.
:param dict cfg: auto configurator runner config.
:return: Custom class object.
"""
basic_class = getattr(base_configs, name)

class Custom(basic_class):
def __init__(self, name, cfg):
"""
:param str name: model name.
:param dict cfg: auto configurator runner config.
"""

super().__init__(name=name, cfg=cfg)

custom_class = Custom(name, cfg)

return custom_class
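
The `custom` helper above simply resolves one of the base config classes by name and subclasses it. A short sketch, reusing the hypothetical `cfg` from the `Basic` example (the `model_args` override is illustrative only):

```python
from nemo.collections.llm.tools.auto_configurator.base_configs.custom import custom

cfg["model_args"] = {"num_layers": 24}  # hypothetical override consumed by get_model_config

model = custom("GPT", cfg)     # instance of a Custom subclass of GPT
print(type(model).__bases__)   # (<class '...base_configs.gpt.GPT'>,)
```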
64 changes: 64 additions & 0 deletions nemo/collections/llm/tools/auto_configurator/base_configs/gemma.py
@@ -0,0 +1,64 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import os
import torch

from nemo.collections import llm
from nemo.collections.llm.utils import Config

from .basic import Basic


class Gemma(Basic):
def __init__(
self,
name: str = "Gemma",
version: int = None,
size: int = 2,
measure: str = "B",
cfg: dict = {},
):
"""
:param str name: model name.
:param int version: model version.
:param int size: model size.
:param str measure: measure of model size. "M" if model size in millions, "B" if in billions.
:param dict cfg: auto configurator runner config.
"""

super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg)
self.config_name = f"{self.name}Config{self.size}{self.measure}"

def get_model_config(self) -> Config:
"""
Function that returns model config.
:return: model config.
:rtype: Config.
"""

model_class = getattr(llm, self.config_name)
kwargs = self.cfg.get("model_args", {})

if self.nemo_run:
model_config = Config(model_class, **kwargs)
else:
model_config = model_class(**kwargs)

model_config.global_batch_size = self.global_batch_size
model_config.seq_length = self.seq_length
model_config.pipeline_dtype = torch.bfloat16

return model_config
62 changes: 62 additions & 0 deletions nemo/collections/llm/tools/auto_configurator/base_configs/gpt.py
@@ -0,0 +1,62 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import os

from nemo.collections import llm
from nemo.collections.llm.utils import Config

from .basic import Basic


class GPT(Basic):
def __init__(
self,
name: str = "GPT",
version: int = 3,
size: int = 5,
measure: str = "B",
cfg: dict = {},
):
"""
:param str name: model name.
:param int version: model version.
:param int size: model size.
:param str measure: measure of model size. "M" if model size in millions, "B" if in billions.
:param dict cfg: auto configurator runner config.
"""

super().__init__(name=name, version=version, size=size, measure=measure, cfg=cfg)
self.config_name = f"{self.name}Config{self.size}{self.measure}"

def get_model_config(self) -> Config:
"""
Function that returns model config.
:return: model config.
:rtype: Config.
"""

model_class = getattr(llm, self.config_name)
kwargs = self.cfg.get("model_args", {})

if self.nemo_run:
model_config = Config(model_class, **kwargs)
else:
model_config = model_class(**kwargs)

model_config.global_batch_size = self.global_batch_size
model_config.seq_length = self.seq_length

return model_config
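
A rough sketch of how the GPT wrapper resolves a concrete NeMo config class, again with the hypothetical `cfg` from above and `nemo_run=False` (`GPTConfig5B` is assumed to exist in `nemo.collections.llm`, matching the `f"{name}Config{size}{measure}"` pattern):

```python
from nemo.collections.llm.tools.auto_configurator.base_configs.gpt import GPT

gpt = GPT(size=5, measure="B", cfg=cfg)  # self.config_name -> "GPTConfig5B"
model_cfg = gpt.get_model_config()       # llm.GPTConfig5B(**cfg.get("model_args", {}))
print(model_cfg.global_batch_size, model_cfg.seq_length)  # overridden from the runner cfg
```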