From 7aea46a6767144f71d7e4cb95d3ffc7c6bee76e3 Mon Sep 17 00:00:00 2001
From: Xujinming
Date: Tue, 28 May 2024 21:08:00 +0800
Subject: [PATCH] PaddleAPEX monitor Llama-13b

---
 llm/llama/npu/llama_8card_dump_setting.sh | 94 +++++++++++++++++++++++
 paddlenlp/transformers/llama/modeling.py  | 15 ++++
 2 files changed, 109 insertions(+)
 create mode 100644 llm/llama/npu/llama_8card_dump_setting.sh

diff --git a/llm/llama/npu/llama_8card_dump_setting.sh b/llm/llama/npu/llama_8card_dump_setting.sh
new file mode 100644
index 000000000000..fcec2289a2ab
--- /dev/null
+++ b/llm/llama/npu/llama_8card_dump_setting.sh
@@ -0,0 +1,94 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# max_steps=${1:-800}
+
+set -x
+ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
+rm -rf ./log_8.0
+rm -rf output
+export PYTHONPATH=../../../:$PYTHONPATH
+export MC2=0
+export GLOG_v=0
+export FLAGS_npu_storage_format=1
+export HCCL_INTRA_PCIE_EHABLE=0
+export HCCL_INTRA_ROCE_ENABLE=1
+export FLAGS_allocator_strategy=naive_best_fit
+export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+export FLAGS_NPU_MC2=1
+export MC2_Recompute=1
+unset PADDLE_TRAINER_ENDPOINTS
+unset DISTRIBUTED_TRAINER_ENDPOINTS
+
+export FLAGS_use_stride_kernel=0
+export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
+export MULTI_STREAM_MEMORY_REUSE=1
+
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
+
+
+export APEX_CONFIG_PATH=/root/paddlejob/workspace/paddleNLP/0524/PaddleNLP/paddlenlp/PaddleAPEX/Acc/configs/tool_config.yaml
+
+python -u -m paddle.distributed.launch \
+    --log_dir "./log_8.0" \
+    ../run_pretrain.py \
+    --model_name_or_path "meta-llama/Llama-2-13b" \
+    --tokenizer_name_or_path "meta-llama/Llama-2-13b" \
+    --input_dir "./pre-data" \
+    --output_dir "./output" \
+    --split 949,50,1 \
+    --max_seq_length 4096 \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 32 \
+    --per_device_eval_batch_size 1 \
+    --use_flash_attention 0 \
+    --use_fused_rms_norm 0 \
+    --virtual_pp_degree 1 \
+    --learning_rate 0.00001 \
+    --min_learning_rate 0.000001 \
+    --max_steps 100 \
+    --decay_steps 2000 \
+    --save_steps 2000 \
+    --seed 100 \
+    --weight_decay 0.01 \
+    --warmup_steps 20 \
+    --max_grad_norm 1.0 \
+    --logging_steps 1 \
+    --dataloader_num_workers 1 \
+    --eval_steps 1001 \
+    --tensor_parallel_degree 4 \
+    --disable_tqdm true \
+    --continue_training 0 \
+    --do_train \
+    --device "npu" \
+    --enable_linear_fused_grad_add false \
+    --fuse_attention_qkv true \
+    --fuse_attention_ffn true \
+    --use_fused_rope true \
+    --recompute_use_reentrant true \
+    --data_cache "./data_cache" \
+    --bf16 \
+    --fp16_opt_level "O2" \
+    --amp_master_grad \
+    --load_sharded_model true \
+    --save_sharded_model true \
+    --pipeline_parallel_degree 1 \
+    --ignore_data_skip 0 \
+    --force_reshard_pp true \
+    --tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \
+    --sequence_parallel 1 \
+    --pipeline_parallel_config "disable_partial_send_recv" \
+    --sharding "stage1" \
+    --sharding_parallel_degree 2 \
+    --recompute true
diff --git a/paddlenlp/transformers/llama/modeling.py b/paddlenlp/transformers/llama/modeling.py
index 535add40fcfc..3caa08608a8a 100755
--- a/paddlenlp/transformers/llama/modeling.py
+++ b/paddlenlp/transformers/llama/modeling.py
@@ -1353,7 +1353,13 @@ class LlamaModel(LlamaPretrainedModel):
     """
 
     def __init__(self, config: LlamaConfig):
+        #####################################
+        from paddlenlp.PaddleAPEX import Acc
+        self.checker = Acc()
+        #####################################
         super().__init__(config)
+
+
         self.vocab_size = config.vocab_size
         self.hidden_size = config.hidden_size
         self.sequence_parallel = config.sequence_parallel
@@ -1471,6 +1477,10 @@ def forward(
         return_dict=False,
         **kwargs,
     ):
+        #####################################
+        self.checker.start()
+        #####################################
+
         if self.sequence_parallel and use_cache:
             raise ValueError("We currently only support sequence parallel without cache.")
 
@@ -1615,6 +1625,11 @@ def forward(
 
         next_cache = next_decoder_cache if use_cache else None
 
+
+        #####################################
+        self.checker.stop()
+        #####################################
+
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
         return BaseModelOutputWithPastAndCrossAttentions(