From dc4b81fe0e5286b395a4a4a65163ba4f4afb98af Mon Sep 17 00:00:00 2001
From: jahatef
Date: Fri, 25 Oct 2024 00:14:25 +0000
Subject: [PATCH] config adjustments for llama and gated activations

---
 configs/llama/13B.yml          | 1 +
 configs/llama/30B.yml          | 1 +
 configs/llama/65B.yml          | 1 +
 configs/llama/7B.yml           | 1 +
 configs/llama/train_config.yml | 2 +-
 megatron/model/transformer.py  | 4 ++--
 6 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/configs/llama/13B.yml b/configs/llama/13B.yml
index a4a5c6e86..a7470cae8 100644
--- a/configs/llama/13B.yml
+++ b/configs/llama/13B.yml
@@ -17,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": false,
 
   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/30B.yml b/configs/llama/30B.yml
index f7d34893d..234445c77 100644
--- a/configs/llama/30B.yml
+++ b/configs/llama/30B.yml
@@ -17,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": false,
 
   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/65B.yml b/configs/llama/65B.yml
index 92c1da18c..8ffffe241 100644
--- a/configs/llama/65B.yml
+++ b/configs/llama/65B.yml
@@ -17,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": false,
 
   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/7B.yml b/configs/llama/7B.yml
index 85982b8ea..0d7c40b24 100644
--- a/configs/llama/7B.yml
+++ b/configs/llama/7B.yml
@@ -17,6 +17,7 @@
   "output_layer_parallelism": "column",
   "norm": "rmsnorm",
   "rms_norm_epsilon": 1.0e-6,
+  "use_bias_in_mlp": false,
 
   "scaled_upper_triang_masked_softmax_fusion": true,
   "bias_gelu_fusion": false,
diff --git a/configs/llama/train_config.yml b/configs/llama/train_config.yml
index 7cc5a5968..36b0d66be 100644
--- a/configs/llama/train_config.yml
+++ b/configs/llama/train_config.yml
@@ -70,5 +70,5 @@
   "steps_per_print": 10,
   "keep_last_n_checkpoints": 4,
   "wall_clock_breakdown": true,
-  "mlp_multiple_of": 256,
+
 }
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index d112a7461..c30f3a64e 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -1269,8 +1269,8 @@ def forward(self, x, attention_mask, layer_past=None):
 
             with torch.enable_grad() if not self.eval else nullcontext():
                 if (
-                    self.activation == "swiglu"
-                    or self.num_experts > 1
+                    mlp_bias == None
+                    or self.num_experts > 1
                     and self.moe_type == "deepspeed"
                 ):
                     # No dropout either
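
For context on how the two halves of this patch fit together: with "use_bias_in_mlp": false in the LLaMA configs, the MLP projections are built without bias terms, so the mlp_bias value handed back to the transformer layer is None, and the rewritten condition in transformer.py then takes the plain residual add instead of the fused bias-dropout-add. The sketch below illustrates that interaction only; GatedMLP, residual_add, and their signatures are hypothetical stand-ins, not the actual GPT-NeoX ParallelMLP / ParallelTransformerLayer code.

# Hypothetical sketch (not GPT-NeoX source): a LLaMA-style gated MLP that
# follows the skip-bias-add convention of returning (output, bias), where the
# bias is None when use_bias_in_mlp is false.
import torch
import torch.nn as nn
import torch.nn.functional as F


class GatedMLP(nn.Module):
    def __init__(self, hidden_size: int, ffn_size: int, use_bias_in_mlp: bool = False):
        super().__init__()
        # Gate and up projections; LLaMA uses no bias anywhere in the MLP.
        self.gate_proj = nn.Linear(hidden_size, ffn_size, bias=use_bias_in_mlp)
        self.up_proj = nn.Linear(hidden_size, ffn_size, bias=use_bias_in_mlp)
        # Output projection keeps its bias separate (skip-bias-add style) so the
        # caller decides how to fuse it with dropout and the residual.
        self.down_proj = nn.Linear(ffn_size, hidden_size, bias=False)
        self.down_bias = (
            nn.Parameter(torch.zeros(hidden_size)) if use_bias_in_mlp else None
        )

    def forward(self, x):
        out = self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
        return out, self.down_bias  # bias is None when use_bias_in_mlp is false


def residual_add(mlp_output, mlp_bias, attention_output, dropout_p=0.0):
    # Same branching idea as the patched condition: with no MLP bias there is
    # nothing to fuse, so skip dropout and take the plain residual add.
    if mlp_bias is None:
        return mlp_output + attention_output
    return F.dropout(mlp_output + mlp_bias, p=dropout_p) + attention_output


if __name__ == "__main__":
    x = torch.randn(2, 4, 64)
    mlp = GatedMLP(hidden_size=64, ffn_size=172, use_bias_in_mlp=False)
    out, bias = mlp(x)
    print(residual_add(out, bias, x).shape)  # torch.Size([2, 4, 64])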