fix mixtraltopk (NVIDIA#10366)
Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Co-authored-by: Marc Romeyn <mromeijn@nvidia.com>
Signed-off-by: adityavavre <aditya.vavre@gmail.com>
2 people authored and adityavavre committed Sep 15, 2024
1 parent 0653829 commit 3a67a0d
Showing 1 changed file with 2 additions and 5 deletions.
7 changes: 2 additions & 5 deletions nemo/collections/llm/gpt/model/mixtral.py
@@ -59,7 +59,7 @@ class MixtralConfig(GPTConfig):
     moe_aux_loss_coeff: float = 0.01
     moe_expert_capacity_factor: float = 1.0
     moe_pad_expert_input_to_capacity: bool = True
-    moe_router_topk: int = 1
+    moe_router_topk: int = 2
     moe_router_pre_softmax: bool = True
     moe_token_dispatcher_type: str = "alltoall"

@@ -104,7 +104,7 @@ class MixtralConfig8x7B(MixtralConfig):
 @dataclass
 class MixtralConfig8x22B(MixtralConfig):
     """
-    Config for Mixtral-8x7B model
+    Config for Mixtral-8x22B model
     Official announcement: https://mistral.ai/news/mixtral-8x22b/
     """

@@ -114,9 +114,6 @@ class MixtralConfig8x22B(MixtralConfig):
     ffn_hidden_size: int = 16384
     max_position_embeddings: int = 4096
     seq_length: int = 4096
-    # MoE
-    num_moe_experts: int = 8
-    moe_router_topk: int = 2


 class MixtralModel(GPTModel):
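The commit changes the shared MixtralConfig default from top-1 to top-2 expert routing, fixes the copy-pasted 8x7B docstring on MixtralConfig8x22B, and drops the per-subclass MoE overrides, presumably because the base MixtralConfig now carries the same values. As a rough illustration of what moe_router_topk controls, below is a minimal top-k routing sketch in plain PyTorch; it is not the NeMo/Megatron-Core token dispatcher, and the function name route_tokens is invented for this example.

# Minimal sketch of top-k MoE routing with pre-softmax gating, for illustration
# only; this is NOT the NeMo/Megatron-Core implementation used by MixtralConfig.
import torch


def route_tokens(router_logits: torch.Tensor, topk: int = 2):
    """Pick `topk` experts per token and return normalized routing weights.

    router_logits: [num_tokens, num_experts]
    """
    # Pre-softmax routing (cf. moe_router_pre_softmax=True): softmax over all
    # experts first, then keep only the top-k probabilities per token.
    probs = torch.softmax(router_logits, dim=-1)
    topk_probs, topk_indices = torch.topk(probs, k=topk, dim=-1)
    # Renormalize so each token's selected expert weights sum to 1.
    topk_probs = topk_probs / topk_probs.sum(dim=-1, keepdim=True)
    return topk_probs, topk_indices


logits = torch.randn(4, 8)                       # 4 tokens, 8 experts, as in Mixtral
weights, experts = route_tokens(logits, topk=2)  # topk=2 matches the new default
print(weights.shape, experts.shape)              # torch.Size([4, 2]) torch.Size([4, 2])

With moe_router_topk = 2 on the base config, each token is dispatched to two of the eight experts, which matches the published Mixtral architecture.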
