PaddleAPEX monitor Llama-13b

CannotBeFatAnyMore committed Jun 11, 2024
1 parent c1cfe63 commit 2e5af24
Showing 3 changed files with 115 additions and 5 deletions.
94 changes: 94 additions & 0 deletions llm/llama/npu/llama_8card_dump_setting.sh
@@ -0,0 +1,94 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# max_steps=${1:-800}

set -x
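# Stop any leftover pretraining processes and clear stale logs/output before relaunching.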
ps aux | grep run_pretrain.py | grep -v grep | awk '{print $2}' | xargs kill -9
rm -rf ./log_8.0
rm -rf output
export PYTHONPATH=../../../:$PYTHONPATH
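# Ascend NPU / HCCL runtime flags and Paddle allocator settings for the 8-card run.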
export MC2=0
export GLOG_v=0
export FLAGS_npu_storage_format=1
export HCCL_INTRA_PCIE_ENABLE=0
export HCCL_INTRA_ROCE_ENABLE=1
export FLAGS_allocator_strategy=naive_best_fit
export ASCEND_RT_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
export FLAGS_NPU_MC2=1
export MC2_Recompute=1
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS

export FLAGS_use_stride_kernel=0
export HCCL_OP_BASE_FFTS_MODE_ENABLE=TRUE
export MULTI_STREAM_MEMORY_REUSE=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh


export APEX_CONFIG_PATH=/root/paddlejob/workspace/paddleNLP/0524/PaddleNLP/paddlenlp/PaddleAPEX/Acc/configs/tool_config.yaml
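# APEX_CONFIG_PATH above points PaddleAPEX at its capture settings (tool_config.yaml); the absolute path refers to a local PaddleNLP checkout and should be adjusted per machine.

# Launch pretraining on 8 NPUs: tensor_parallel_degree 4 x sharding_parallel_degree 2, pipeline_parallel_degree 1.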

python -u -m paddle.distributed.launch \
--log_dir "./log_8.0" \
../run_pretrain.py \
--model_name_or_path "meta-llama/Llama-2-13b" \
--tokenizer_name_or_path "meta-llama/Llama-2-13b" \
--input_dir "./pre-data" \
--output_dir "./output" \
--split 949,50,1 \
--max_seq_length 4096 \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 32 \
--per_device_eval_batch_size 1 \
--use_flash_attention 0 \
--use_fused_rms_norm 0 \
--virtual_pp_degree 1 \
--learning_rate 0.00001 \
--min_learning_rate 0.000001 \
--max_steps 100 \
--decay_steps 2000 \
--save_steps 2000 \
--seed 100 \
--weight_decay 0.01 \
--warmup_steps 20 \
--max_grad_norm 1.0 \
--logging_steps 1 \
--dataloader_num_workers 1 \
--eval_steps 1001 \
--tensor_parallel_degree 4 \
--disable_tqdm true \
--continue_training 0 \
--do_train \
--device "npu" \
--enable_linear_fused_grad_add false \
--fuse_attention_qkv true \
--fuse_attention_ffn true \
--use_fused_rope true \
--recompute_use_reentrant true \
--data_cache "./data_cache" \
--bf16 \
--fp16_opt_level "O2" \
--amp_master_grad \
--load_sharded_model true \
--save_sharded_model true \
--pipeline_parallel_degree 1 \
--ignore_data_skip 0 \
--force_reshard_pp true \
--tensor_parallel_config "enable_mp_async_allreduce enable_mp_skip_c_identity" \
--sequence_parallel 1 \
--pipeline_parallel_config "disable_partial_send_recv" \
--sharding "stage1" \
--sharding_parallel_degree 2 \
--recompute true
8 changes: 4 additions & 4 deletions paddlenlp/transformers/llama/fusion_ops.py
@@ -197,10 +197,10 @@ def fusion_flash_attention(
)
# attn_output shape: [bs, seqlen/sep, num_head, head_dim]
assert (
-     config.sep_parallel_degree > 1 and q_len % config.sep_parallel_degree == 0
- ), f"q_len:{q_len}, config.sep_parallel_degree:{config.sep_parallel_degree}"
- q_len = q_len // config.sep_parallel_degree
- num_heads = num_heads * config.sep_parallel_degree
+     config['sep_parallel_degree'] > 1 and q_len % config['sep_parallel_degree'] == 0
+ ), f"q_len:{q_len}, config.sep_parallel_degree:{config['sep_parallel_degree']}"
+ q_len = q_len // config['sep_parallel_degree']
+ num_heads = num_heads * config['sep_parallel_degree']

if sequence_parallel:
attn_output = attn_output.reshape([bsz * q_len, head_dim * num_heads])
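For context, the change above mirrors the new call site in modeling.py below: fusion_flash_attention now receives a small plain dict (FA_cfg) instead of the full LlamaConfig, so the attribute lookups become key lookups. A standalone illustration of that calling convention; the _Cfg stub and the numbers are illustrative, not taken from the repository:

# Stand-in for the relevant LlamaConfig field; the values are made up.
class _Cfg:
    sep_parallel_degree = 2

config, q_len, num_heads = _Cfg(), 4096, 40

# Caller (scaled_dot_product_attention) wraps the one field the fused kernel needs:
FA_cfg = {"sep_parallel_degree": config.sep_parallel_degree}

# Callee (fusion_flash_attention) therefore reads it back with key access instead of attribute access:
assert FA_cfg["sep_parallel_degree"] > 1 and q_len % FA_cfg["sep_parallel_degree"] == 0
q_len = q_len // FA_cfg["sep_parallel_degree"]
num_heads = num_heads * FA_cfg["sep_parallel_degree"]
print(q_len, num_heads)  # 2048 80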
18 changes: 17 additions & 1 deletion paddlenlp/transformers/llama/modeling.py
@@ -216,9 +216,10 @@ def scaled_dot_product_attention(
_, kv_seq_len, _, _ = value_states.shape

if config.use_flash_attention and flash_attention:
+ FA_cfg = {"sep_parallel_degree": config.sep_parallel_degree}
return fusion_ops.fusion_flash_attention(
query_states,
- config,
+ FA_cfg,
key_states,
value_states,
attention_mask,
@@ -1353,7 +1354,13 @@ class LlamaModel(LlamaPretrainedModel):
"""

def __init__(self, config: LlamaConfig):
+ #####################################
+ from paddlenlp.PaddleAPEX import Acc
+ self.checker = Acc()
+ #####################################
super().__init__(config)
+
+
self.vocab_size = config.vocab_size
self.hidden_size = config.hidden_size
self.sequence_parallel = config.sequence_parallel
Expand Down Expand Up @@ -1471,6 +1478,10 @@ def forward(
return_dict=False,
**kwargs,
):
+ #####################################
+ self.checker.start()
+ #####################################
+
if self.sequence_parallel and use_cache:
raise ValueError("We currently only support sequence parallel without cache.")

@@ -1615,6 +1626,11 @@ def forward(

next_cache = next_decoder_cache if use_cache else None

+
+ #####################################
+ self.checker.stop()
+ #####################################
+
if not return_dict:
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPastAndCrossAttentions(
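Taken together, the three hooks above form one pattern: build the PaddleAPEX accuracy checker when the model is constructed, then bracket each forward pass with start()/stop() while APEX_CONFIG_PATH points at the capture configuration. Below is a minimal, self-contained sketch of that pattern; it assumes only the Acc API visible in this diff (constructor, start(), stop()), and DemoLayer plus the config path are illustrative stand-ins, not PaddleNLP code:

import os

import paddle
from paddlenlp.PaddleAPEX import Acc

# PaddleAPEX reads its capture settings from the YAML this variable points at
# (llama_8card_dump_setting.sh exports the same variable before launching training).
os.environ.setdefault("APEX_CONFIG_PATH", "/path/to/PaddleAPEX/Acc/configs/tool_config.yaml")


class DemoLayer(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.checker = Acc()  # build the accuracy monitor once, at construction time
        self.linear = paddle.nn.Linear(8, 8)

    def forward(self, x):
        self.checker.start()  # begin capturing for this forward pass
        out = self.linear(x)
        self.checker.stop()  # stop capturing before the outputs are returned
        return out


if __name__ == "__main__":
    layer = DemoLayer()
    layer(paddle.randn([2, 8]))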
