diff --git a/tests/test_tipc/llm/llama2/N4C32/llama2-70b_dpo_bs16_bf16_tp8_pp1_sd4_acc32_dygraph.sh b/tests/test_tipc/llm/llama2/N4C32/llama2-70b_dpo_bs16_bf16_tp8_pp4_sd1_acc32_dygraph.sh
similarity index 93%
rename from tests/test_tipc/llm/llama2/N4C32/llama2-70b_dpo_bs16_bf16_tp8_pp1_sd4_acc32_dygraph.sh
rename to tests/test_tipc/llm/llama2/N4C32/llama2-70b_dpo_bs16_bf16_tp8_pp4_sd1_acc32_dygraph.sh
index 6f1e9fc8e7f1..38cda1cbeb56 100644
--- a/tests/test_tipc/llm/llama2/N4C32/llama2-70b_dpo_bs16_bf16_tp8_pp1_sd4_acc32_dygraph.sh
+++ b/tests/test_tipc/llm/llama2/N4C32/llama2-70b_dpo_bs16_bf16_tp8_pp4_sd1_acc32_dygraph.sh
@@ -16,8 +16,8 @@
 param="model_name_or_path=meta-llama/Llama-2-70b "
 param+="per_device_train_batch_size=1 "
 param+="tensor_parallel_degree=8 "
-param+="pipeline_parallel_degree=1 "
-param+="sharding_parallel_degree=4 "
+param+="pipeline_parallel_degree=4 "
+param+="sharding_parallel_degree=1 "
 param+="gradient_accumulation_steps=32 "
 param+="run_stage=dpo "
 param+="run_mode=tp8_pp1_sd4_acc32_dygraph "
diff --git a/tests/test_tipc/llm/llama2/N4C32/llama2-70b_lora_bs16_bf16_tp4_pp1_acc4_dygraph.sh b/tests/test_tipc/llm/llama2/N4C32/llama2-70b_lora_bs16_bf16_tp8_pp4_acc32_dygraph.sh
similarity index 89%
rename from tests/test_tipc/llm/llama2/N4C32/llama2-70b_lora_bs16_bf16_tp4_pp1_acc4_dygraph.sh
rename to tests/test_tipc/llm/llama2/N4C32/llama2-70b_lora_bs16_bf16_tp8_pp4_acc32_dygraph.sh
index 2f5235b39c2c..dde5d4d5a61c 100644
--- a/tests/test_tipc/llm/llama2/N4C32/llama2-70b_lora_bs16_bf16_tp4_pp1_acc4_dygraph.sh
+++ b/tests/test_tipc/llm/llama2/N4C32/llama2-70b_lora_bs16_bf16_tp8_pp4_acc32_dygraph.sh
@@ -15,9 +15,9 @@
 param="model_name_or_path=meta-llama/Llama-2-70b "
 param+="per_device_train_batch_size=1 "
-param+="tensor_parallel_degree=4 "
-param+="pipeline_parallel_degree=1 "
-param+="gradient_accumulation_steps=4 "
+param+="tensor_parallel_degree=8 "
+param+="pipeline_parallel_degree=4 "
+param+="gradient_accumulation_steps=32 "
 param+="run_stage=lora "
 param+="run_mode=tp4_pp1_acc4_dygraph "
 param+="device_num=N4C32 "
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json
index 4ca9c223d2de..c90f058e1f0b 100644
--- a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json
@@ -21,13 +21,13 @@
     "disable_tqdm": true,
     "load_best_model_at_end": true,
     "tensor_parallel_degree": 8,
-    "sharding_parallel_degree": 4,
-    "pipeline_parallel_degree": 1,
+    "sharding_parallel_degree": 1,
+    "pipeline_parallel_degree": 4,
     "sharding": "stage2",
     "use_flash_attention": true,
     "flash_mask": true,
     "recompute": true,
-    "recompute_granularity": "full_attn",
+    "recompute_granularity": "full",
     "benchmark": true,
     "unified_checkpoint": true,
     "autotuner_benchmark":false,
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/lora.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/lora.json
index f3efc69b6cb0..f2101b1dc48d 100644
--- a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/lora.json
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/lora.json
@@ -3,7 +3,7 @@
     "dataset_name_or_path": "./data/sft_benchmark_train/",
     "output_dir": "./checkpoints/lora_ckpts",
     "per_device_train_batch_size": 1,
-    "gradient_accumulation_steps": 4,
+    "gradient_accumulation_steps": 32,
     "per_device_eval_batch_size": 8,
     "eval_accumulation_steps":16,
     "num_train_epochs": 1,
@@ -30,11 +30,12 @@
     "load_best_model_at_end": true,
     "eval_with_do_generation": false,
     "metric_for_best_model": "accuracy",
-    "recompute": false,
+    "recompute": true,
     "recompute_granularity": "full",
     "save_total_limit": 1,
-    "tensor_parallel_degree": 4,
-    "pipeline_parallel_degree": 1,
+    "tensor_parallel_output": true,
+    "tensor_parallel_degree": 8,
+    "pipeline_parallel_degree": 4,
     "lora": true,
     "unified_checkpoint": true,
     "benchmark": true,
diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/sft.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/sft.json
index f3cddd5cbc81..186961130ae8 100644
--- a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/sft.json
+++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/sft.json
@@ -30,7 +30,7 @@
     "load_best_model_at_end": true,
     "eval_with_do_generation": false,
     "metric_for_best_model": "accuracy",
-    "recompute": false,
+    "recompute": true,
     "recompute_granularity": "full",
     "save_total_limit": 1,
     "benchmark": true,
diff --git a/tests/test_tipc/llm/qwen2_5/N4C32/qwen-qwen2_5-72b_dpo_bs16_bf16_tp8_sd4_acc4_dygraph.sh b/tests/test_tipc/llm/qwen2_5/N4C32/qwen-qwen2_5-72b_dpo_bs16_bf16_tp8_pp4_acc32_dygraph.sh
similarity index 93%
rename from tests/test_tipc/llm/qwen2_5/N4C32/qwen-qwen2_5-72b_dpo_bs16_bf16_tp8_sd4_acc4_dygraph.sh
rename to tests/test_tipc/llm/qwen2_5/N4C32/qwen-qwen2_5-72b_dpo_bs16_bf16_tp8_pp4_acc32_dygraph.sh
index e7940e7d9999..5c7b9d0c68eb 100644
--- a/tests/test_tipc/llm/qwen2_5/N4C32/qwen-qwen2_5-72b_dpo_bs16_bf16_tp8_sd4_acc4_dygraph.sh
+++ b/tests/test_tipc/llm/qwen2_5/N4C32/qwen-qwen2_5-72b_dpo_bs16_bf16_tp8_pp4_acc32_dygraph.sh
@@ -16,8 +16,8 @@
 param="model_name_or_path=Qwen/Qwen2.5-72B "
 param+="per_device_train_batch_size=1 "
 param+="tensor_parallel_degree=8 "
-param+="sharding_parallel_degree=4 "
-param+="gradient_accumulation_steps=4 "
+param+="pipeline_parallel_degree=4 "
+param+="gradient_accumulation_steps=32 "
 param+="run_stage=dpo "
 param+="run_mode=tp8_sd4_acc4_dygraph "
 param+="device_num=N4C32 "
diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json
index d6e41e0bc52e..2df563143693 100644
--- a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json
@@ -4,7 +4,7 @@
     "dev_dataset_path": "./data/dpo_benchmark_train/dev.json",
     "output_dir": "./checkpoints/dpo_ckpts",
     "per_device_train_batch_size": 1,
-    "gradient_accumulation_steps": 4,
+    "gradient_accumulation_steps": 32,
     "per_device_eval_batch_size": 1,
     "num_train_epochs": 1,
     "learning_rate": 1e-06,
@@ -22,11 +22,11 @@
     "load_best_model_at_end": true,
     "tensor_parallel_output": true,
     "tensor_parallel_degree": 8,
-    "sharding_parallel_degree": 4,
+    "pipeline_parallel_degree": 4,
     "sharding": "stage2",
     "use_flash_attention": true,
     "flash_mask": true,
-    "recompute": false,
+    "recompute": true,
     "recompute_granularity": "full",
     "benchmark": true,
     "unified_checkpoint": true,
diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/lora.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/lora.json
index e340c7258ac4..21815924487a 100644
--- a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/lora.json
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/lora.json
@@ -30,7 +30,7 @@
     "load_best_model_at_end": true,
     "eval_with_do_generation": false,
     "metric_for_best_model": "accuracy",
-    "recompute": false,
+    "recompute": true,
     "recompute_granularity": "full",
     "save_total_limit": 1,
     "tensor_parallel_degree": 8,
diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/sft.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/sft.json
index 4f901cd74ca4..4e3e43b7a948 100644
--- a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/sft.json
+++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/sft.json
@@ -30,7 +30,7 @@
     "load_best_model_at_end": true,
     "eval_with_do_generation": false,
     "metric_for_best_model": "accuracy",
-    "recompute": false,
+    "recompute": true,
     "recompute_granularity": "full",
     "save_total_limit": 1,
     "benchmark": true,
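Note on the renamed N4C32 configurations above: the new layouts set tensor_parallel_degree=8 and pipeline_parallel_degree=4 across 4 nodes x 8 GPUs. Below is a minimal shell sketch, not part of the PR, for checking that a chosen set of degrees multiplies into the available cards; the variable names and the 32-card assumption are illustrative only, and the quotient is read as the remaining data/sharding replica count.

# Sanity-check sketch (illustrative values; N4C32 assumed to mean 4 nodes x 8 GPUs = 32 cards).
tp=8; pp=4; sd=1; cards=32
product=$((tp * pp * sd))
if [ $((cards % product)) -eq 0 ]; then
    # The leftover factor is how many model replicas the data/sharding dimension gets.
    echo "valid layout: replicas=$((cards / product))"
else
    echo "invalid layout: tp*pp*sd=$product does not divide $cards cards" >&2
fi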