PaddleAPEX monitor Llama-13b
CannotBeFatAnyMore committed May 29, 2024
1 parent c1cfe63 commit 7aea46a
Showing 2 changed files with 109 additions and 0 deletions.
94 changes: 94 additions & 0 deletions llm/llama/npu/llama_8card_dump_setting.sh
@@ -0,0 +1,94 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# max_steps=${1:-800}

set -x
ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
rm -rf ./log_8.0
rm -rf output
export PYTHONPATH=../../../:$PYTHONPATH
export MC2=0
export GLOG_v=0
export FLAGS_npu_storage_format=1
export HCCL_INTRA_PCIE_ENABLE=0
export HCCL_INTRA_ROCE_ENABLE=1
export FLAGS_allocator_strategy=naive_best_fit
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS

export FLAGS_use_stride_kernel=0
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
export MULTI_STREAM_MEMORY_REUSE=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh


export APEX_CONFIG_PATH=/root/paddlejob/workspace/paddleNLP/0524/PaddleNLP/paddlenlp/PaddleAPEX/Acc/configs/tool_config.yaml

python -u -m paddle.distributed.launch \
--log_dir "./log_8.0" \
../run_pretrain.py \
--model_name_or_path "meta-llama/Llama-2-13b" \
--tokenizer_name_or_path "meta-llama/Llama-2-13b" \
--input_dir "./pre-data" \
--output_dir "./output" \
--split 949,50,1 \
--max_seq_length 4096 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 32 \
--per_device_eval_batch_size 1 \
--use_flash_attention 0 \
--use_fused_rms_norm 0 \
--virtual_pp_degree 1 \
--learning_rate 0.00001 \
--min_learning_rate 0.000001 \
--max_steps 100 \
--decay_steps 2000 \
--save_steps 2000 \
--seed 100 \
--weight_decay 0.01 \
--warmup_steps 20 \
--max_grad_norm 1.0 \
--logging_steps 1 \
--dataloader_num_workers 1 \
--eval_steps 1001 \
--tensor_parallel_degree 4 \
--disable_tqdm true \
--continue_training 0 \
--do_train \
--device "npu" \
--enable_linear_fused_grad_add false \
--fuse_attention_qkv true \
--fuse_attention_ffn true \
--use_fused_rope true \
--recompute_use_reentrant true \
--data_cache "./data_cache" \
--bf16 \
--fp16_opt_level "O2" \
--amp_master_grad \
--load_sharded_model true \
--save_sharded_model true \
--pipeline_parallel_degree 1 \
--ignore_data_skip 0 \
--force_reshard_pp true \
--tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \
--sequence_parallel 1 \
--pipeline_parallel_config "disable_partial_send_recv" \
--sharding "stage1" \
--sharding_parallel_degree 2 \
--recompute true
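
For reference, a minimal way to launch the dump run from a PaddleNLP checkout. This is a sketch built only from the relative paths the script itself uses (../run_pretrain.py, ./pre-data, ./log_8.0); the data directory and the APEX_CONFIG_PATH edit are assumptions that must be adapted to your own machine:

# 1. Edit APEX_CONFIG_PATH in llama_8card_dump_setting.sh so it points at
#    <your checkout>/paddlenlp/PaddleAPEX/Acc/configs/tool_config.yaml
#    (the /root/paddlejob/... path above is machine-specific).
# 2. Launch from the script's directory so the relative paths resolve,
#    with the pre-processed pretraining data already placed in ./pre-data:
cd llm/llama/npu
bash llama_8card_dump_setting.sh
# 3. paddle.distributed.launch writes per-rank logs (workerlog.N) into ./log_8.0:
tail -f ./log_8.0/workerlog.0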
15 changes: 15 additions & 0 deletions paddlenlp/transformers/llama/modeling.py
@@ -1353,7 +1353,13 @@ class LlamaModel(LlamaPretrainedModel):
    """

    def __init__(self, config: LlamaConfig):
        #####################################
        from paddlenlp.PaddleAPEX import Acc
        self.checker = Acc()
        #####################################
        super().__init__(config)


        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.sequence_parallel = config.sequence_parallel
@@ -1471,6 +1477,10 @@ def forward(
        return_dict=False,
        **kwargs,
    ):
        #####################################
        self.checker.start()
        #####################################

        if self.sequence_parallel and use_cache:
            raise ValueError("We currently only support sequence parallel without cache.")

@@ -1615,6 +1625,11 @@ def forward(

        next_cache = next_decoder_cache if use_cache else None


        #####################################
        self.checker.stop()
        #####################################

        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPastAndCrossAttentions(
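
Taken together, the modeling.py changes follow a simple instrument-the-forward pattern: create one Acc checker per model instance, call start() at the top of forward, and call stop() just before the outputs are returned. Below is a condensed sketch of that pattern, assuming only the Acc API as used in the diff above; MonitoredLayer and its contents are illustrative, not part of the commit, and APEX_CONFIG_PATH is assumed to be set as in the shell script.

import paddle
from paddlenlp.PaddleAPEX import Acc  # same import the commit adds to LlamaModel.__init__


class MonitoredLayer(paddle.nn.Layer):
    """Illustrative layer instrumented the same way as LlamaModel above."""

    def __init__(self, hidden_size: int = 16):
        self.checker = Acc()  # plain attribute, assigned before super().__init__() as in the diff
        super().__init__()
        self.linear = paddle.nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        self.checker.start()  # begin op-level capture, mirroring the top of LlamaModel.forward
        out = self.linear(x)  # stands in for the decoder stack
        self.checker.stop()   # end capture just before results are returned
        return out


layer = MonitoredLayer()
_ = layer(paddle.randn([2, 16]))  # ops executed between start() and stop() are captured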
