update N4C32 config

PaddlePaddle · Dec 23, 2024 · 15b9336
1 parent e6a2f30
commit 15b9336
Show file tree

Hide file tree

Showing 9 changed files with 21 additions and 20 deletions.
diff --git a/...po_bs16_bf16_tp8_pp1_sd4_acc32_dygraph.sh b/...po_bs16_bf16_tp8_pp4_sd1_acc32_dygraph.sh
@@ -16,8 +16,8 @@
 param="model_name_or_path=meta-llama/Llama-2-70b "
 param+="per_device_train_batch_size=1 "
 param+="tensor_parallel_degree=8 "
-param+="pipeline_parallel_degree=1 "
-param+="sharding_parallel_degree=4 "
+param+="pipeline_parallel_degree=4 "
+param+="sharding_parallel_degree=1 "
 param+="gradient_accumulation_steps=32 "
 param+="run_stage=dpo "
 param+="run_mode=tp8_pp1_sd4_acc32_dygraph "

diff --git a/...0b_lora_bs16_bf16_tp4_pp1_acc4_dygraph.sh b/...b_lora_bs16_bf16_tp8_pp4_acc32_dygraph.sh
@@ -15,9 +15,9 @@
 
 param="model_name_or_path=meta-llama/Llama-2-70b "
 param+="per_device_train_batch_size=1 "
-param+="tensor_parallel_degree=4 "
-param+="pipeline_parallel_degree=1 "
-param+="gradient_accumulation_steps=4 "
+param+="tensor_parallel_degree=8 "
+param+="pipeline_parallel_degree=4 "
+param+="gradient_accumulation_steps=32 "
 param+="run_stage=lora "
 param+="run_mode=tp4_pp1_acc4_dygraph "
 param+="device_num=N4C32 "

diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json
@@ -21,13 +21,13 @@
     "disable_tqdm": true,
     "load_best_model_at_end": true,
     "tensor_parallel_degree": 8,
-    "sharding_parallel_degree": 4,
-    "pipeline_parallel_degree": 1,
+    "sharding_parallel_degree": 1,
+    "pipeline_parallel_degree": 4,
     "sharding": "stage2",
     "use_flash_attention": true,
     "flash_mask": true,
     "recompute": true,
-    "recompute_granularity": "full_attn",
+    "recompute_granularity": "full",
     "benchmark": true,
     "unified_checkpoint": true,
     "autotuner_benchmark":false,

diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/lora.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/lora.json
@@ -3,7 +3,7 @@
     "dataset_name_or_path": "./data/sft_benchmark_train/",
     "output_dir": "./checkpoints/lora_ckpts",
     "per_device_train_batch_size": 1,
-    "gradient_accumulation_steps": 4,
+    "gradient_accumulation_steps": 32,
     "per_device_eval_batch_size": 8,
     "eval_accumulation_steps":16,
     "num_train_epochs": 1,
@@ -30,11 +30,12 @@
     "load_best_model_at_end": true,
     "eval_with_do_generation": false,
     "metric_for_best_model": "accuracy",
-    "recompute": false,
+    "recompute": true,
 	"recompute_granularity": "full",
     "save_total_limit": 1,
-    "tensor_parallel_degree": 4,
-    "pipeline_parallel_degree": 1,
+    "tensor_parallel_output": true,
+    "tensor_parallel_degree": 8,
+    "pipeline_parallel_degree": 4,
     "lora": true,
     "unified_checkpoint": true,
 	"benchmark": true,

diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/sft.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/sft.json
@@ -30,7 +30,7 @@
     "load_best_model_at_end": true,
     "eval_with_do_generation": false,
     "metric_for_best_model": "accuracy",
-    "recompute": false,
+    "recompute": true,
 	"recompute_granularity": "full",
     "save_total_limit": 1,
 	"benchmark": true,

diff --git a/...72b_dpo_bs16_bf16_tp8_sd4_acc4_dygraph.sh b/...2b_dpo_bs16_bf16_tp8_pp4_acc32_dygraph.sh
@@ -16,8 +16,8 @@
 param="model_name_or_path=Qwen/Qwen2.5-72B "
 param+="per_device_train_batch_size=1 "
 param+="tensor_parallel_degree=8 "
-param+="sharding_parallel_degree=4 "
-param+="gradient_accumulation_steps=4 "
+param+="pipeline_parallel_degree=4 "
+param+="gradient_accumulation_steps=32 "
 param+="run_stage=dpo "
 param+="run_mode=tp8_sd4_acc4_dygraph "
 param+="device_num=N4C32 "

diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json
@@ -4,7 +4,7 @@
     "dev_dataset_path": "./data/dpo_benchmark_train/dev.json",
     "output_dir": "./checkpoints/dpo_ckpts",
     "per_device_train_batch_size": 1,
-    "gradient_accumulation_steps": 4,
+    "gradient_accumulation_steps": 32,
     "per_device_eval_batch_size": 1,
     "num_train_epochs": 1,
     "learning_rate": 1e-06,
@@ -22,11 +22,11 @@
     "load_best_model_at_end": true,
     "tensor_parallel_output": true,
     "tensor_parallel_degree": 8,
-    "sharding_parallel_degree": 4,
+    "pipeline_parallel_degree": 4,
     "sharding": "stage2",
     "use_flash_attention": true,
     "flash_mask": true,
-    "recompute": false,
+    "recompute": true,
     "recompute_granularity": "full",
     "benchmark": true,
     "unified_checkpoint": true,

diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/lora.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/lora.json
@@ -30,7 +30,7 @@
     "load_best_model_at_end": true,
     "eval_with_do_generation": false,
     "metric_for_best_model": "accuracy",
-    "recompute": false,
+    "recompute": true,
 	"recompute_granularity": "full",
     "save_total_limit": 1,
     "tensor_parallel_degree": 8,

diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/sft.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/sft.json
@@ -30,7 +30,7 @@
     "load_best_model_at_end": true,
     "eval_with_do_generation": false,
     "metric_for_best_model": "accuracy",
-    "recompute": false,
+    "recompute": true,
 	"recompute_granularity": "full",
     "save_total_limit": 1,
 	"benchmark": true,