PaddleAPEX monitor Llama-13b

CannotBeFatAnyMore committed Jun 11, 2024
1 parent c1cfe63 commit 2e5af24
Showing 3 changed files with 115 additions and 5 deletions.
94 changes: 94 additions & 0 deletions llm/llama/npu/llama_8card_dump_setting.sh
@@ -0,0 +1,94 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# max_steps=${1:-800}

set -x
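# Stop any leftover pretraining processes and clear stale logs/output before relaunching.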
ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
rm -rf ./log_8.0
rm -rf output
export PYTHONPATH=../../../:$PYTHONPATH
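# Ascend NPU / HCCL runtime flags and Paddle allocator settings for the 8-card run.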
export MC2=0
export GLOG_v=0
export FLAGS_npu_storage_format=1
export HCCL_INTRA_PCIE_ENABLE=0
export HCCL_INTRA_ROCE_ENABLE=1
export FLAGS_allocator_strategy=naive_best_fit
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS

export FLAGS_use_stride_kernel=0
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
export MULTI_STREAM_MEMORY_REUSE=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh


export APEX_CONFIG_PATH=/root/paddlejob/workspace/paddleNLP/0524/PaddleNLP/paddlenlp/PaddleAPEX/Acc/configs/tool_config.yaml
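# APEX_CONFIG_PATH above points PaddleAPEX at its capture settings (tool_config.yaml); the absolute path refers to a local PaddleNLP checkout and should be adjusted per machine.

# Launch pretraining on 8 NPUs: tensor_parallel_degree 4 x sharding_parallel_degree 2, pipeline_parallel_degree 1.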

python -u -m paddle.distributed.launch \
--log_dir "./log_8.0" \
../run_pretrain.py \
--model_name_or_path "meta-llama/Llama-2-13b" \
--tokenizer_name_or_path "meta-llama/Llama-2-13b" \
--input_dir "./pre-data" \
--output_dir "./output" \
--split 949,50,1 \
--max_seq_length 4096 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 32 \
--per_device_eval_batch_size 1 \
--use_flash_attention 0 \
--use_fused_rms_norm 0 \
--virtual_pp_degree 1 \
--learning_rate 0.00001 \
--min_learning_rate 0.000001 \
--max_steps 100 \
--decay_steps 2000 \
--save_steps 2000 \
--seed 100 \
--weight_decay 0.01 \
--warmup_steps 20 \
--max_grad_norm 1.0 \
--logging_steps 1 \
--dataloader_num_workers 1 \
--eval_steps 1001 \
--tensor_parallel_degree 4 \
--disable_tqdm true \
--continue_training 0 \
--do_train \
--device "npu" \
--enable_linear_fused_grad_add false \
--fuse_attention_qkv true \
--fuse_attention_ffn true \
--use_fused_rope true \
--recompute_use_reentrant true \
--data_cache "./data_cache" \
--bf16 \
--fp16_opt_level "O2" \
--amp_master_grad \
--load_sharded_model true \
--save_sharded_model true \
--pipeline_parallel_degree 1 \
--ignore_data_skip 0 \
--force_reshard_pp true \
--tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \
--sequence_parallel 1 \
--pipeline_parallel_config "disable_partial_send_recv" \
--sharding "stage1" \
--sharding_parallel_degree 2 \
--recompute true
8 changes: 4 additions & 4 deletions paddlenlp/transformers/llama/fusion_ops.py
@@ -197,10 +197,10 @@ def fusion_flash_attention(
)
# attn_output shape: [bs, seqlen/sep, num_head, head_dim]
assert (
-     config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0
- ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}"
- q_len = q_len // config.sep_parallel_degree
- num_heads = num_heads * config.sep_parallel_degree
+     config['sep_parallel_degree'] > 1 and q_len % config['sep_parallel_degree'] == 0
+ ), f"q_len:{q_len}, config.sep_parallel_degree:{config['sep_parallel_degree']}"
+ q_len = q_len // config['sep_parallel_degree']
+ num_heads = num_heads * config['sep_parallel_degree']

if sequence_parallel:
attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads])
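For context, the change above mirrors the new call site in modeling.py below: fusion_flash_attention now receives a small plain dict (FA_cfg) instead of the full LlamaConfig, so the attribute lookups become key lookups. A standalone illustration of that calling convention; the _Cfg stub and the numbers are illustrative, not taken from the repository:

# Stand-in for the relevant LlamaConfig field; the values are made up.
class _Cfg:
    sep_parallel_degree = 2

config, q_len, num_heads = _Cfg(), 4096, 40

# Caller (scaled_dot_product_attention) wraps the one field the fused kernel needs:
FA_cfg = {"sep_parallel_degree": config.sep_parallel_degree}

# Callee (fusion_flash_attention) therefore reads it back with key access instead of attribute access:
assert FA_cfg["sep_parallel_degree"] > 1 and q_len % FA_cfg["sep_parallel_degree"] == 0
q_len = q_len // FA_cfg["sep_parallel_degree"]
num_heads = num_heads * FA_cfg["sep_parallel_degree"]
print(q_len, num_heads)  # 2048 80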
18 changes: 17 additions & 1 deletion paddlenlp/transformers/llama/modeling.py
@@ -216,9 +216,10 @@ def scaled_dot_product_attention(
_, kv_seq_len, _, _ = value_states.shape

if config.use_flash_attention and flash_attention:
+ FA_cfg = {"sep_parallel_degree": config.sep_parallel_degree}
return fusion_ops.fusion_flash_attention(
query_states,
- config,
+ FA_cfg,
key_states,
value_states,
attention_mask,
@@ -1353,7 +1354,13 @@ class LlamaModel(LlamaPretrainedModel):
"""

def __init__(self, config: LlamaConfig):
+ #####################################
+ from paddlenlp.PaddleAPEX import Acc
+ self.checker = Acc()
+ #####################################
super().__init__(config)
+
+
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.sequence_parallel = config.sequence_parallel
Expand Down Expand Up @@ -1471,6 +1478,10 @@ def forward(
return_dict=False,
**kwargs,
):
+ #####################################
+ self.checker.start()
+ #####################################
+
if self.sequence_parallel and use_cache:
raise ValueError("We currently only support sequence parallel without cache.")

@@ -1615,6 +1626,11 @@ def forward(

next_cache = next_decoder_cache if use_cache else None

+
+ #####################################
+ self.checker.stop()
+ #####################################
+
if not return_dict:
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPastAndCrossAttentions(
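Taken together, the three hooks above form one pattern: build the PaddleAPEX accuracy checker when the model is constructed, then bracket each forward pass with start()/stop() while APEX_CONFIG_PATH points at the capture configuration. Below is a minimal, self-contained sketch of that pattern; it assumes only the Acc API visible in this diff (constructor, start(), stop()), and DemoLayer plus the config path are illustrative stand-ins, not PaddleNLP code:

import os

import paddle
from paddlenlp.PaddleAPEX import Acc

# PaddleAPEX reads its capture settings from the YAML this variable points at
# (llama_8card_dump_setting.sh exports the same variable before launching training).
os.environ.setdefault("APEX_CONFIG_PATH", "/path/to/PaddleAPEX/Acc/configs/tool_config.yaml")


class DemoLayer(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.checker = Acc()  # build the accuracy monitor once, at construction time
        self.linear = paddle.nn.Linear(8, 8)

    def forward(self, x):
        self.checker.start()  # begin capturing for this forward pass
        out = self.linear(x)
        self.checker.stop()  # stop capturing before the outputs are returned
        return out


if __name__ == "__main__":
    layer = DemoLayer()
    layer(paddle.randn([2, 8]))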
