PaddleAPEX monitor Llama-13b
CannotBeFatAnyMore committed May 29, 2024
1 parent c1cfe63 commit 7aea46a
Showing 2 changed files with 109 additions and 0 deletions.
94 changes: 94 additions & 0 deletions llm/llama/npu/llama_8card_dump_setting.sh
@@ -0,0 +1,94 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# max_steps=${1:-800}

set -x
ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
rm -rf ./log_8.0
rm -rf output
export PYTHONPATH=../../../:$PYTHONPATH
export MC2=0
export GLOG_v=0
export FLAGS_npu_storage_format=1
export HCCL_INTRA_PCIE_ENABLE=0
export HCCL_INTRA_ROCE_ENABLE=1
export FLAGS_allocator_strategy=naive_best_fit
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS

export FLAGS_use_stride_kernel=0
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
export MULTI_STREAM_MEMORY_REUSE=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh


export APEX_CONFIG_PATH=/root/paddlejob/workspace/paddleNLP/0524/PaddleNLP/paddlenlp/PaddleAPEX/Acc/configs/tool_config.yaml

python -u -m paddle.distributed.launch \
--log_dir "./log_8.0" \
../run_pretrain.py \
--model_name_or_path "meta-llama/Llama-2-13b" \
--tokenizer_name_or_path "meta-llama/Llama-2-13b" \
--input_dir "./pre-data" \
--output_dir "./output" \
--split 949,50,1 \
--max_seq_length 4096 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 32 \
--per_device_eval_batch_size 1 \
--use_flash_attention 0 \
--use_fused_rms_norm 0 \
--virtual_pp_degree 1 \
--learning_rate 0.00001 \
--min_learning_rate 0.000001 \
--max_steps 100 \
--decay_steps 2000 \
--save_steps 2000 \
--seed 100 \
--weight_decay 0.01 \
--warmup_steps 20 \
--max_grad_norm 1.0 \
--logging_steps 1 \
--dataloader_num_workers 1 \
--eval_steps 1001 \
--tensor_parallel_degree 4 \
--disable_tqdm true \
--continue_training 0 \
--do_train \
--device "npu" \
--enable_linear_fused_grad_add false \
--fuse_attention_qkv true \
--fuse_attention_ffn true \
--use_fused_rope true \
--recompute_use_reentrant true \
--data_cache "./data_cache" \
--bf16 \
--fp16_opt_level "O2" \
--amp_master_grad \
--load_sharded_model true \
--save_sharded_model true \
--pipeline_parallel_degree 1 \
--ignore_data_skip 0 \
--force_reshard_pp true \
--tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \
--sequence_parallel 1 \
--pipeline_parallel_config "disable_partial_send_recv" \
--sharding "stage1" \
--sharding_parallel_degree 2 \
--recompute true
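
For reference, a minimal way to launch the dump run from a PaddleNLP checkout. This is a sketch built only from the relative paths the script itself uses (../run_pretrain.py, ./pre-data, ./log_8.0); the data directory and the APEX_CONFIG_PATH edit are assumptions that must be adapted to your own machine:

# 1. Edit APEX_CONFIG_PATH in llama_8card_dump_setting.sh so it points at
#    <your checkout>/paddlenlp/PaddleAPEX/Acc/configs/tool_config.yaml
#    (the /root/paddlejob/... path above is machine-specific).
# 2. Launch from the script's directory so the relative paths resolve,
#    with the pre-processed pretraining data already placed in ./pre-data:
cd llm/llama/npu
bash llama_8card_dump_setting.sh
# 3. paddle.distributed.launch writes per-rank logs (workerlog.N) into ./log_8.0:
tail -f ./log_8.0/workerlog.0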
15 changes: 15 additions & 0 deletions paddlenlp/transformers/llama/modeling.py
@@ -1353,7 +1353,13 @@ class LlamaModel(LlamaPretrainedModel):
    """

    def __init__(self, config: LlamaConfig):
        #####################################
        from paddlenlp.PaddleAPEX import Acc
        self.checker = Acc()
        #####################################
        super().__init__(config)


        self.vocab_size = config.vocab_size
        self.hidden_size = config.hidden_size
        self.sequence_parallel = config.sequence_parallel
@@ -1471,6 +1477,10 @@ def forward(
        return_dict=False,
        **kwargs,
    ):
        #####################################
        self.checker.start()
        #####################################

        if self.sequence_parallel and use_cache:
            raise ValueError("We currently only support sequence parallel without cache.")

@@ -1615,6 +1625,11 @@ def forward(

        next_cache = next_decoder_cache if use_cache else None


        #####################################
        self.checker.stop()
        #####################################

        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPastAndCrossAttentions(
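
Taken together, the modeling.py changes follow a simple instrument-the-forward pattern: create one Acc checker per model instance, call start() at the top of forward, and call stop() just before the outputs are returned. Below is a condensed sketch of that pattern, assuming only the Acc API as used in the diff above; MonitoredLayer and its contents are illustrative, not part of the commit, and APEX_CONFIG_PATH is assumed to be set as in the shell script.

import paddle
from paddlenlp.PaddleAPEX import Acc  # same import the commit adds to LlamaModel.__init__


class MonitoredLayer(paddle.nn.Layer):
    """Illustrative layer instrumented the same way as LlamaModel above."""

    def __init__(self, hidden_size: int = 16):
        self.checker = Acc()  # plain attribute, assigned before super().__init__() as in the diff
        super().__init__()
        self.linear = paddle.nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        self.checker.start()  # begin op-level capture, mirroring the top of LlamaModel.forward
        out = self.linear(x)  # stands in for the decoder stack
        self.checker.stop()   # end capture just before results are returned
        return out


layer = MonitoredLayer()
_ = layer(paddle.randn([2, 16]))  # ops executed between start() and stop() are captured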
