[LLM] Add expert parallel #9368

Merged
Changes from all commits (29 commits)
9414d67
add expert parallel utils
DrownFish19 Sep 30, 2024
a6260cf
update gates
DrownFish19 Oct 9, 2024
39d1660
update
DrownFish19 Oct 16, 2024
86ee3cc
Merge remote-tracking branch 'paddlenlp/develop' into dev_20241018_ad…
DrownFish19 Oct 21, 2024
cc41578
update base methods
DrownFish19 Oct 21, 2024
0fcba13
Merge remote-tracking branch 'paddlenlp/develop' into dev_20241018_ad…
DrownFish19 Oct 22, 2024
2b74f30
update moe_layer
DrownFish19 Oct 24, 2024
2a24fda
Merge remote-tracking branch 'paddlenlp/develop' into dev_20241018_ad…
DrownFish19 Oct 24, 2024
f517473
update moebase
DrownFish19 Oct 24, 2024
1a3399e
add moe_gate and moe_layer for qwen2moe
DrownFish19 Oct 24, 2024
d6a16eb
add config
DrownFish19 Oct 30, 2024
440b848
Merge branch 'PaddlePaddle:develop' into dev_20241018_add_expert_para…
DrownFish19 Oct 30, 2024
fad1a4f
update
DrownFish19 Oct 31, 2024
e0f3e93
Merge remote-tracking branch 'paddlenlp/develop' into dev_20241018_ad…
DrownFish19 Oct 31, 2024
8701b52
update gate dtype
DrownFish19 Nov 4, 2024
4af8a68
Merge branch 'PaddlePaddle:develop' into dev_20241018_add_expert_para…
DrownFish19 Nov 4, 2024
4ef7d4f
Merge branch 'PaddlePaddle:develop' into dev_20241018_add_expert_para…
DrownFish19 Nov 4, 2024
448ecbd
update moe gate and layer
DrownFish19 Nov 4, 2024
0a3af3b
Merge remote-tracking branch 'paddlenlp/develop' into dev_20241018_ad…
DrownFish19 Nov 5, 2024
77ec9b0
update moe_layer.py
DrownFish19 Nov 5, 2024
ff93012
update
DrownFish19 Nov 6, 2024
de2d257
update
DrownFish19 Nov 7, 2024
83bdedd
update token_priority method
DrownFish19 Nov 7, 2024
63f6755
update data type
DrownFish19 Nov 7, 2024
17537b3
remove old moe
DrownFish19 Nov 8, 2024
88a91a1
Merge remote-tracking branch 'paddlenlp/develop' into dev_20241018_ad…
DrownFish19 Nov 8, 2024
2b0bf16
fix moe capacity reduce.Max
DrownFish19 Nov 11, 2024
801f0ff
update comment
DrownFish19 Nov 13, 2024
358483b
lint
DrownFish19 Nov 15, 2024
34 changes: 34 additions & 0 deletions llm/config/qwen2moe/lora_argument.json
@@ -0,0 +1,34 @@
{
"model_name_or_path": "Qwen/Qwen2-57B-A14B",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/lora_ckpts",
"per_device_train_batch_size": 4,
"gradient_accumulation_steps": 4,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"num_train_epochs": 3,
"learning_rate": 3e-04,
"warmup_steps": 30,
"logging_steps": 1,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"src_length": 1024,
"max_length": 2048,
"bf16": true,
"fp16_opt_level": "O2",
"do_train": true,
"do_eval": true,
"disable_tqdm": true,
"load_best_model_at_end": true,
"eval_with_do_generation": false,
"metric_for_best_model": "accuracy",
"recompute": true,
"save_total_limit": 1,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"lora": true,
"unified_checkpoint": true,
"zero_padding": false,
"use_flash_attention": true,
"pissa": false
}
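
For orientation, a minimal sketch of inspecting this config with the standard json module; the file path is assumed relative to the repository root, and in practice the llm fine-tuning entry point parses the same file into its argument dataclasses.

import json

# Minimal sketch: peek at the parallelism- and LoRA-related knobs in this
# config before launching fine-tuning (path assumed relative to the repo root).
with open("llm/config/qwen2moe/lora_argument.json") as f:
    cfg = json.load(f)

for key in ("model_name_or_path", "tensor_parallel_degree",
            "pipeline_parallel_degree", "lora", "use_flash_attention"):
    print(f"{key} = {cfg[key]}")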
40 changes: 40 additions & 0 deletions llm/config/qwen2moe/pretrain_argument.json
@@ -0,0 +1,40 @@
{
"model_name_or_path": "Qwen/Qwen2-57B-A14B",
"tokenizer_name_or_path": "Qwen/Qwen2-57B-A14B",
"input_dir": "./data",
"output_dir": "./checkpoints/pretrain_ckpts",
"per_device_train_batch_size": 2,
"gradient_accumulation_steps": 1,
"per_device_eval_batch_size": 2,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"sharding": "stage2",
"virtual_pp_degree": 1,
"sequence_parallel": 0,
"use_flash_attention": true,
"use_fused_rms_norm": true,
"max_seq_length": 4096,
"learning_rate": 3e-05,
"min_learning_rate": 3e-06,
"warmup_steps": 30,
"logging_steps": 1,
"max_steps": 10000,
"save_steps": 5000,
"eval_steps": 1000,
"weight_decay": 0.01,
"bf16": true,
"fp16_opt_level": "O2",
"warmup_ratio": 0.01,
"max_grad_norm": 1.0,
"dataloader_num_workers": 1,
"continue_training": 1,
"do_train": true,
"do_eval": true,
"do_predict": true,
"disable_tqdm": true,
"recompute": true,
"distributed_dataloader": 1,
"recompute_granularity": "full",
"unified_checkpoint": true,
"save_total_limit": 2
}
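
A back-of-the-envelope sketch of how the parallel degrees in this config divide the job; the GPU count is an assumption, not something taken from the PR. With tensor and pipeline parallel degrees of 1, every rank falls into the sharding group used by "sharding": "stage2".

# Rough sketch (not the PaddleNLP launcher itself): derive the sharding/data-
# parallel degree from an assumed world size and the degrees in this config.
world_size = 8                      # assumed number of GPUs passed to the launcher
tensor_parallel_degree = 1
pipeline_parallel_degree = 1
sharding_parallel_degree = world_size // (tensor_parallel_degree * pipeline_parallel_degree)
print(sharding_parallel_degree)     # -> 8, all ranks shard optimizer state under stage2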
33 changes: 33 additions & 0 deletions llm/config/qwen2moe/sft_argument.json
@@ -0,0 +1,33 @@
{
"model_name_or_path": "Qwen/Qwen2-57B-A14B",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/sft_ckpts",
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 4,
"per_device_eval_batch_size": 8,
"eval_accumulation_steps":16,
"num_train_epochs": 3,
"learning_rate": 3e-05,
"warmup_steps": 30,
"logging_steps": 1,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"src_length": 1024,
"max_length": 2048,
"bf16": true,
"fp16_opt_level": "O2",
"do_train": true,
"do_eval": true,
"disable_tqdm": true,
"load_best_model_at_end": true,
"eval_with_do_generation": false,
"metric_for_best_model": "accuracy",
"recompute": true,
"save_total_limit": 1,
"tensor_parallel_degree": 1,
"pipeline_parallel_degree": 1,
"sharding": "stage2",
"zero_padding": false,
"unified_checkpoint": true,
"use_flash_attention": true
}
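
A quick, illustrative way to see what distinguishes this SFT config from the LoRA config above; paths are assumed relative to the repository root.

import json

# Illustrative sketch: compare the SFT and LoRA configs added in this PR and
# print only the keys whose values differ.
with open("llm/config/qwen2moe/sft_argument.json") as f:
    sft = json.load(f)
with open("llm/config/qwen2moe/lora_argument.json") as f:
    lora = json.load(f)

for key in sorted(set(sft) | set(lora)):
    if sft.get(key) != lora.get(key):
        print(f"{key}: sft={sft.get(key)!r}  lora={lora.get(key)!r}")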
8 changes: 8 additions & 0 deletions paddlenlp/trainer/training_args.py
@@ -882,6 +882,14 @@ class TrainingArguments:
default=False,
metadata={"help": "Enable MoE (Mixture of Experts) expert parallel training"},
)
expert_max_capacity: Optional[int] = field(
default=pow(2, 32),
metadata={"help": "Enable MoE (Mixture of Experts) expert max token capacity"},
)
expert_min_capacity: Optional[int] = field(
default=1,
metadata={"help": "Enable MoE (Mixture of Experts) expert min token capacity"},
)
release_grads: Optional[bool] = field(
default=False, metadata={"help": "Whether to release gradients during training. Default is `False`."}
)
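
A hypothetical sketch of the semantics these two new arguments suggest: the routed per-expert token capacity is clamped into [expert_min_capacity, expert_max_capacity]. This is not the PR's actual capacity code, only an illustration of the bounds and their defaults.

# Hypothetical helper (not taken from this PR): clamp a raw per-expert token
# capacity using the defaults of the two new TrainingArguments fields.
def clamp_expert_capacity(raw_capacity: int,
                          expert_min_capacity: int = 1,
                          expert_max_capacity: int = 2 ** 32) -> int:
    return max(expert_min_capacity, min(raw_capacity, expert_max_capacity))

print(clamp_expert_capacity(0))     # -> 1, floor applied
print(clamp_expert_capacity(4096))  # -> 4096, unchanged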
2 changes: 2 additions & 0 deletions paddlenlp/transformers/__init__.py
@@ -32,6 +32,8 @@
from .attention_utils import create_bigbird_rand_mask_idx_list
from .sequence_parallel_utils import AllGatherVarlenOp, sequence_parallel_sparse_mask_labels
from .tensor_parallel_utils import parallel_matmul, parallel_linear, fused_head_and_loss_fn
from .moe_gate import *
from .moe_layer import *

try:
from paddle.distributed.fleet.utils.sequence_parallel_utils import (
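
A quick way to confirm the effect of the new wildcard imports, assuming a PaddleNLP build that includes this PR: whatever symbols moe_gate.py and moe_layer.py define become importable from the top-level transformers package.

# Requires a paddlenlp installation containing this PR: list the MoE gate/layer
# names re-exported by the new wildcard imports in transformers/__init__.py.
import paddlenlp.transformers as transformers

print([name for name in dir(transformers) if "moe" in name.lower()])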
2 changes: 2 additions & 0 deletions paddlenlp/transformers/deepseek_v2/modeling.py
@@ -18,6 +18,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" Paddle DeepSeek model."""
from __future__ import annotations

import math
import warnings
from functools import partial