
Commit

Implemented auto hyperparameter search using Optuna
Saf9933 committed Oct 14, 2024
1 parent 917c925 commit 4601177
Showing 3 changed files with 74 additions and 117 deletions.
6 changes: 4 additions & 2 deletions mlora/config/config.py
@@ -1,6 +1,5 @@
from typing import Dict


class DictConfig:
__params_map: Dict[str, str] = {}

@@ -9,4 +8,7 @@ def __init__(self, config: Dict[str, str]) -> None:

def init(self, params_map: Dict[str, str], config: Dict[str, str]):
for key, value in params_map.items():
setattr(self, key, config[value])
if key in config:
setattr(self, key, config[key])
else:
setattr(self, key, None)
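A minimal standalone sketch of the new fallback behavior (the demo class name and the example params_map/config values are assumptions for illustration, not taken from the mLoRA sources):

from typing import Dict


class _DemoConfig:
    """Standalone copy of the updated DictConfig.init logic, for illustration only."""

    def init(self, params_map: Dict[str, str], config: Dict[str, str]):
        for key, value in params_map.items():
            if key in config:
                setattr(self, key, config[key])
            else:
                setattr(self, key, None)  # missing keys now default to None


cfg = _DemoConfig()
cfg.init({"rank_": "rank", "alpha_": "alpha"}, {"rank_": 8})
print(cfg.rank_)   # 8
print(cfg.alpha_)  # None (missing keys no longer raise KeyError)

Note that the lookup is now config[key] rather than config[value], so the mapped value in params_map is no longer used to index the config.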
145 changes: 46 additions & 99 deletions mlora/model/llm/model_llama.py
@@ -1,10 +1,8 @@
import logging
from collections import OrderedDict
from typing import Dict, List, Optional, Tuple, override

import torch
from torch.nn.modules import Sequential
from transformers import AutoConfig, AutoModelForCausalLM
from typing import List, Optional, Tuple, Dict


from mlora.model.args import LinearInfo, LLMModelArgs, Masks, ModelData
from mlora.model.checkpoint import CheckpointRecomputeFunction
@@ -20,17 +18,6 @@
from .model_llm import LLMModel


# input_tokens shape is: batch_size * seq_len
# default: upper triangular matrix like below, i.e. diagonal = 1
# 0 -inf -inf
# 0 0 -inf
# 0 0 0
# additional_mask: batch_size * seq_len
# default: if None, the mask is the default matrix above; where set to True, the mask entry will be -inf
# example: [[True, False, False]]
# -inf -inf -inf
# -inf 0 -inf
# -inf 0 0
def precompute_mask(
input_tokens: torch.Tensor,
n_heads: int,
@@ -68,7 +55,7 @@ def precompute_mask(

LlamaSequentialModuleIO = Tuple[
torch.Tensor, # the input batch tokens
torch.Tensor, # the mask matrics
torch.Tensor, # the mask matrices
ModelData, # batch data config
bool, # whether to use checkpoint
]
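
The mask layout described in the comment above can be reproduced with a short standalone sketch (an illustration only, not the collapsed mLoRA implementation of precompute_mask; the helper name is made up):

import torch


def demo_causal_mask(seq_len: int, additional_mask=None) -> torch.Tensor:
    # upper-triangular causal mask: -inf strictly above the diagonal, 0 elsewhere
    mask = torch.triu(torch.full((seq_len, seq_len), float("-inf")), diagonal=1)
    if additional_mask is not None:
        # positions flagged True (e.g. padding tokens) are masked out entirely
        for i, is_pad in enumerate(additional_mask):
            if is_pad:
                mask[:, i] = float("-inf")
    return mask


print(demo_causal_mask(3, additional_mask=[True, False, False]))
# tensor([[-inf, -inf, -inf],
#         [-inf,   0., -inf],
#         [-inf,   0.,   0.]])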
@@ -130,10 +117,7 @@ def output_layer_forward():
}

module_name = self.name()
assert (
module_name in forward_func_dict
), f"error module name {
module_name}"
assert module_name in forward_func_dict, f"error module name {module_name}"

return forward_func_dict[module_name]()

@@ -143,22 +127,15 @@ class LlamaModel(LLMModel):

def __init__(self, args: LLMModelArgs):
self.name_or_path_: str = args.name_or_path_
# sequential model

self.norm_eps_ = args.norm_eps_

self.device_ = args.device_
self.n_heads_ = args.n_heads_
self.dim_ = args.dim_
self.vocab_size_ = args.vocab_size_

# need to set
self.pad_token_id_ = args.pad_token_id_
self.eos_token_id_ = -1

@override
def forward(self, input: ModelData) -> torch.Tensor:
# train model or inference model: output is probs
tokens = torch.tensor(
input.batch_tokens_, dtype=torch.int64, device=self.device_
)
@@ -175,37 +152,27 @@ def forward(self, input: ModelData) -> torch.Tensor:

return data[0]

@override

@staticmethod
def from_pretrained(
path: str,
device: str,
precision: str,
partial_model_to_device: Optional[List[int]] = None,
) -> LLMModel:
# create the device map for parallelism
def create_device_map() -> str | Dict[str, str]:
device_map: str | Dict[str, str]
if partial_model_to_device is None:
device_map = device
return device
else:
config = AutoConfig.from_pretrained(path)
# Be careful, this is hard coded.
weight_map = [
"model.embed_tokens",
*[
f"model.layers.{layer_id}"
for layer_id in range(0, config.num_hidden_layers)
],
*[f"model.layers.{layer_id}" for layer_id in range(0, config.num_hidden_layers)],
"model.norm",
"lm_head",
]
device_map = {map_item: "disk" for map_item in weight_map}
for partial_weight in partial_model_to_device:
device_map[weight_map[partial_weight]] = device
return device_map
return {map_item: "disk" for map_item in weight_map}
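
For reference, the hard-coded weight map expands as follows for a hypothetical 2-layer model (the layer count is made up); with the returned mapping every listed module starts offloaded to disk, and the partial_model_to_device loop shown above then maps selected entries back onto the target device:

weight_map = [
    "model.embed_tokens",
    "model.layers.0",
    "model.layers.1",
    "model.norm",
    "lm_head",
]
device_map = {name: "disk" for name in weight_map}
# {'model.embed_tokens': 'disk', 'model.layers.0': 'disk', 'model.layers.1': 'disk',
#  'model.norm': 'disk', 'lm_head': 'disk'}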

# arguments for loading the pretrained LlamaForCausalLM model
load_type_dict = {
"fp32": torch.float32,
"fp16": torch.float16,
@@ -218,57 +185,20 @@ def create_device_map() -> str | Dict[str, str]:
}

logging.info(f"Loading model with precision - {precision}")

if precision in load_type_dict:
additional_load_args["torch_dtype"] = load_type_dict[precision]
else:
load_4bit = precision in ["nf4", "fp4"]
load_8bit = precision == "int8"

additional_load_args["torch_dtype"] = torch.float32
additional_load_args["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=load_4bit,
load_in_8bit=load_8bit,
# int8 only for GPU, fp32 for cpu
llm_int8_enable_fp32_cpu_offload=True,
# do not keep fp16 weights; forward/backward converts int8 to fp16 as needed
llm_int8_has_fp16_weight=False,
# only for qlora 4bit
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type=precision,
)

additional_load_args["torch_dtype"] = load_type_dict.get(precision, torch.float32)
llama_model = AutoModelForCausalLM.from_pretrained(path, **additional_load_args)

if llama_model.config.model_type not in LlamaCompatibleModelTypes:
assert f"unsupported model type {
llama_model.config.model_type}, loading with llama compatible mode."

logging.info(
f"loading llama compatible model - {llama_model.config.model_type}"
)

llama_args = LLMModelArgs(llama_model.config)
if llama_args.pad_token_id_ is None:
llama_args.pad_token_id_ = -1
llama_args.pad_token_id_ = llama_args.pad_token_id_ or -1
llama_args.device_ = device
llama_args.dtype_ = llama_model.dtype

# load model from pretrained large model
model = LlamaModel.convert_model_from_huggingface(llama_model, llama_args)

return model
return LlamaModel.convert_model_from_huggingface(llama_model, llama_args)

@staticmethod
def convert_model_from_huggingface(
llama_model: AutoModelForCausalLM, llama_args: LLMModelArgs
):
def convert_model_from_huggingface(llama_model: AutoModelForCausalLM, llama_args: LLMModelArgs):
llama_model.requires_grad_(False)

seq_model: OrderedDict[str, torch.nn.Module] = OrderedDict()

seq_model = OrderedDict()
seq_model.update(
{
"embedding": LlamaSequentialWrapper(
@@ -304,31 +234,48 @@ def convert_model_from_huggingface(

return model

@override
def load_adapter(self, adapter_model: AdapterModel):
# module is LlamaSequentialWrapper
for module in self.seq_module_:
if module.name() != "Decoder":
continue
module.wrapper_module_.load_adapter(adapter_model)
if module.name() == "Decoder":
module.wrapper_module_.load_adapter(adapter_model)

@override
def offload_adapter(self, adapter_name: str):
# currently only the transformer blocks have adapters
for module in self.seq_module_:
if module.name() != "Decoder":
continue
module.wrapper_module_.offload_adapter(adapter_name)
if module.name() == "Decoder":
module.wrapper_module_.offload_adapter(adapter_name)

def forward(self, input: ModelData) -> torch.Tensor:

@override
def linears_info(self) -> OrderedDict[str, LinearInfo]:
ret_val = OrderedDict()
for module in self.seq_module_:
if module.name() != "Decoder":
continue
ret_val.update(module.wrapper_module_.linears_info())
if module.name() == "Decoder":
ret_val.update(module.wrapper_module_.linears_info())
return ret_val

@override
def sequential(self) -> Sequential:
return self.seq_module_

# New methods for applying LoRA rank and enabling specific layers
def apply_lora(self, rank):
"""
Apply the LoRA adapter with the specified rank to the model layers.
"""
for layer in self.seq_module_:
if isinstance(layer.wrapper_module_, Decoder):
layer.wrapper_module_.apply_lora(rank=rank)

def enable_layers(self, enabled_layers):
"""
Enable specific layers for LoRA adaptation.
"""
if enabled_layers == 'last_2':
for layer in self.seq_module_[-2:]:
layer.wrapper_module_.enable_lora()
elif enabled_layers == 'all':
for layer in self.seq_module_:
layer.wrapper_module_.enable_lora()
elif enabled_layers == 'specific':
specific_layers = [1, 3, 5]
for i, layer in enumerate(self.seq_module_):
if i in specific_layers:
layer.wrapper_module_.enable_lora()
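
A hedged usage sketch for the two new helpers; the import path follows this file's location, the checkpoint path, device, and rank are placeholders, and it assumes the wrapped Decoder blocks expose apply_lora()/enable_lora(), which this commit relies on but does not show:

from mlora.model.llm.model_llama import LlamaModel

# Placeholder arguments; replace with a real checkpoint path and device.
model = LlamaModel.from_pretrained("/path/to/llama-checkpoint", device="cuda:0", precision="fp16")
model.apply_lora(rank=8)       # attach rank-8 LoRA adapters to every Decoder block
model.enable_layers("last_2")  # restrict LoRA training to the last two modules in seq_module_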
40 changes: 24 additions & 16 deletions mlora_train.py
@@ -16,29 +16,37 @@
#
# Github: https://github.com/TUDB-Labs/mLoRA

import optuna
import mlora.model
import mlora.utils
import mlora.executor
import mlora.config

if __name__ == "__main__":
args = mlora.utils.get_cmd_args()
def mock_train_model(rank, learning_rate, enabled_layers):
# Mock model training
score = rank * learning_rate * (1 if enabled_layers == 'last_2' else 2)
return score

mlora.utils.setup_seed(args.seed)
mlora.utils.setup_logging(args.log_level, args.log_file)
mlora.utils.setup_cuda_check()
mlora.utils.setup_metric_logger(args.metric_file)
def objective(trial):
# the hyperparameter search space
rank = trial.suggest_categorical('rank', [4, 8, 16])
learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
enabled_layers = trial.suggest_categorical('enabled_layers', ['last_2', 'all', 'specific'])

# enable the trace mode for profiling performance
if args.trace:
mlora.utils.setup_trace_mode()
# Mock training for testing purposes
eval_metric = mock_train_model(rank, learning_rate, enabled_layers)

# Return the evaluation metric (for Optuna, we minimize this score)
return eval_metric

tokenizer, model = mlora.model.load_model(args)
config = mlora.config.MLoRAConfig(args.config)

# init all tasks from the config file
executor = mlora.executor.Executor(model, tokenizer, config)
for item in config.tasks_:
executor.add_task(item)
if __name__ == "__main__":
# Run hyperparameter search using Optuna
study = optuna.create_study(direction='minimize') # Set to 'minimize' since we want to reduce the score
study.optimize(objective, n_trials=10) # Run 10 trials (adjust as needed)

executor.execute()
# Output the best hyperparameters found
print(f"Best hyperparameters found: {study.best_params}")
print(f"Best evaluation metric: {study.best_value}")
print(f"Number of trials completed: {len(study.trials)}")
