From 7c1c9baec71836e71bd495cb3bebccd8dac66431 Mon Sep 17 00:00:00 2001 From: Liujie0926 <44688141+Liujie0926@users.noreply.github.com> Date: Tue, 21 Jan 2025 15:19:36 +0800 Subject: [PATCH] [LLM Benchmark]update scripts (#9722) * add no_proxy & del paddlenlp_ops * update timeout for dpo * fix sequence_parallel * add timeout * add Total_Tokens_per_second_per_gpu * fix Tokens_per_second_per_gpu * update Total_Tokens_per_second_per_gpu --- fix_time | 0 .../benchmark_json/llama2-70b/dpo.json | 2 +- .../llm/llama2/benchmark_common/prepare.sh | 6 +++--- .../llm/llama2/benchmark_common/run_benchmark.sh | 15 +++++++++++---- .../benchmark_json/qwen-qwen2_5-72b/dpo.json | 2 +- .../llm/qwen2_5/benchmark_common/prepare.sh | 6 +++--- .../llm/qwen2_5/benchmark_common/run_benchmark.sh | 15 +++++++++++---- 7 files changed, 30 insertions(+), 16 deletions(-) create mode 100644 fix_time diff --git a/fix_time b/fix_time new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json index c95540903f4e..979009513c98 100644 --- a/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json +++ b/tests/test_tipc/llm/llama2/benchmark_common/benchmark_json/llama2-70b/dpo.json @@ -15,7 +15,7 @@ "max_seq_len": 4096, "max_prompt_len": 2048, "pipeline_parallel_config": "disable_partial_send_recv enable_clear_every_step_cache", - "sequence_parallel": 1, + "sequence_parallel": 0, "bf16": true, "fp16_opt_level": "O2", "do_train": true, diff --git a/tests/test_tipc/llm/llama2/benchmark_common/prepare.sh b/tests/test_tipc/llm/llama2/benchmark_common/prepare.sh index a1b917731589..ccfd8b76b67e 100644 --- a/tests/test_tipc/llm/llama2/benchmark_common/prepare.sh +++ b/tests/test_tipc/llm/llama2/benchmark_common/prepare.sh @@ -24,9 +24,9 @@ python setup.py install cd - # install paddlenlp_ops -cd ../csrc/ -python setup_cuda.py install -cd - +# cd ../csrc/ +# python setup_cuda.py install +# cd - cd ../llm cp -r ../tests/test_tipc/llm/llama2/benchmark_common/benchmark_json ./ diff --git a/tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh index 9e0c259520b2..32a03ee11a55 100644 --- a/tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/llm/llama2/benchmark_common/run_benchmark.sh @@ -36,7 +36,7 @@ function _set_params(){ skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="Effective_Tokens_per_second_per_gpu:" # (必选)解析日志,筛选出性能数据所在行的关键字 is_large_model=True # (可选)普通模型默认为False,如果添加大模型且只取一条ips设置为True - convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" + convergence_key="Total_Tokens_per_second_per_gpu:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" fp_item="bf16" # 以下为通用执行命令,无特殊可不用修改 @@ -105,18 +105,25 @@ function _train(){ ;; esac cd ../llm/ + export no_proxy=bcebos.com echo "train_cmd: ${train_cmd} log_file: ${log_file}" python -c "import paddlenlp" if [[ ${model_name_or_path} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else - timeout 30m ${train_cmd} > ${log_file} 2>&1 + timeout 60m ${train_cmd} > ${log_file} 2>&1 # echo ${train_cmd} Effective_Tokens_per_second=`cat ${log_file} | grep -E 'Effective_Tokens_per_second|Effective tokens per second:' \ |awk -F': ' '{print $2}' |awk -F' ' '{print $1}'` num_gpu=$(echo "$device_num" | sed 's/^.*C//') - ips=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}') - echo "Effective_Tokens_per_second_per_gpu: ${ips}" >> ${log_file} + Effective_Tokens_per_second_per_gpu=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}') + echo "Effective_Tokens_per_second_per_gpu: ${Effective_Tokens_per_second_per_gpu}" >> ${log_file} + Train_samples_per_second=`cat ${log_file} | grep 'train_samples_per_second' \ + |awk -F'train_samples_per_second: ' '{print $2}' |awk -F', ' '{print $1}'` + length=4096 + Total_Tokens_per_second=$(awk -v a="$Train_samples_per_second" -v b="$length" 'BEGIN {printf "%.2f\n", a * b}') + Total_Tokens_per_second_per_gpu=$(awk -v a="$Total_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}') + echo "Total_Tokens_per_second_per_gpu: ${Total_Tokens_per_second_per_gpu}" >> ${log_file} fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL" diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json index 0972a78141c0..e78e43649fff 100644 --- a/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json/qwen-qwen2_5-72b/dpo.json @@ -15,7 +15,7 @@ "max_seq_len": 4096, "max_prompt_len": 2048, "pipeline_parallel_config": "disable_partial_send_recv enable_clear_every_step_cache", - "sequence_parallel": 1, + "sequence_parallel": 0, "bf16": true, "fp16_opt_level": "O2", "do_train": true, diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh b/tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh index 416ded186efb..92d9f0a5061f 100644 --- a/tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/prepare.sh @@ -24,9 +24,9 @@ python setup.py install cd - # install paddlenlp_ops -cd ../csrc/ -python setup_cuda.py install -cd - +# cd ../csrc/ +# python setup_cuda.py install +# cd - cd ../llm cp -r ../tests/test_tipc/llm/qwen2_5/benchmark_common/benchmark_json ./ diff --git a/tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh b/tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh index 9e0c259520b2..32a03ee11a55 100644 --- a/tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh +++ b/tests/test_tipc/llm/qwen2_5/benchmark_common/run_benchmark.sh @@ -36,7 +36,7 @@ function _set_params(){ skip_steps=0 # (必选)解析日志,跳过模型前几个性能不稳定的step keyword="Effective_Tokens_per_second_per_gpu:" # (必选)解析日志,筛选出性能数据所在行的关键字 is_large_model=True # (可选)普通模型默认为False,如果添加大模型且只取一条ips设置为True - convergence_key="loss:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" + convergence_key="Total_Tokens_per_second_per_gpu:" # (可选)解析日志,筛选出收敛数据所在行的关键字 如:convergence_key="loss:" fp_item="bf16" # 以下为通用执行命令,无特殊可不用修改 @@ -105,18 +105,25 @@ function _train(){ ;; esac cd ../llm/ + export no_proxy=bcebos.com echo "train_cmd: ${train_cmd} log_file: ${log_file}" python -c "import paddlenlp" if [[ ${model_name_or_path} =~ "CE" ]];then # CE精度-不限制执行时间 ${train_cmd} > ${log_file} 2>&1 else - timeout 30m ${train_cmd} > ${log_file} 2>&1 + timeout 60m ${train_cmd} > ${log_file} 2>&1 # echo ${train_cmd} Effective_Tokens_per_second=`cat ${log_file} | grep -E 'Effective_Tokens_per_second|Effective tokens per second:' \ |awk -F': ' '{print $2}' |awk -F' ' '{print $1}'` num_gpu=$(echo "$device_num" | sed 's/^.*C//') - ips=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}') - echo "Effective_Tokens_per_second_per_gpu: ${ips}" >> ${log_file} + Effective_Tokens_per_second_per_gpu=$(awk -v a="$Effective_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}') + echo "Effective_Tokens_per_second_per_gpu: ${Effective_Tokens_per_second_per_gpu}" >> ${log_file} + Train_samples_per_second=`cat ${log_file} | grep 'train_samples_per_second' \ + |awk -F'train_samples_per_second: ' '{print $2}' |awk -F', ' '{print $1}'` + length=4096 + Total_Tokens_per_second=$(awk -v a="$Train_samples_per_second" -v b="$length" 'BEGIN {printf "%.2f\n", a * b}') + Total_Tokens_per_second_per_gpu=$(awk -v a="$Total_Tokens_per_second" -v b="$num_gpu" 'BEGIN {printf "%.2f\n", a / b}') + echo "Total_Tokens_per_second_per_gpu: ${Total_Tokens_per_second_per_gpu}" >> ${log_file} fi if [ $? -ne 0 ];then echo -e "${model_name}, FAIL"