#!/bin/bash
# make sure to have the latest versions of transformers, bitsandbytes, accelerate, peft, and flash-attention
# use this fork for trl: https://github.com/huggingface/trl/pull/1863
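# one possible setup sketch (assumes a working CUDA toolchain; the trl PR is
# checked out via GitHub's pull/<N>/head ref convention):
#   pip install -U transformers bitsandbytes accelerate peft
#   pip install flash-attn --no-build-isolation
#   git clone https://github.com/huggingface/trl.git
#   cd trl && git fetch origin pull/1863/head:pr-1863 && git checkout pr-1863 && pip install -e . && cd ..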
# training setup
NUM_NODES=1
GPUS_PER_NODE=8
WORLD_SIZE=$(($NUM_NODES*$GPUS_PER_NODE))
# so processes know who to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
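# optional sanity check: print the rendezvous endpoint and process count up
# front so a misconfigured launch fails visibly
echo "rendezvous endpoint: $MASTER_ADDR:$MASTER_PORT, world size: $WORLD_SIZE"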
# go to the trl folder
cd trl
export CMD=" \
    ./examples/scripts/sft.py \
    --model_name_or_path="meta-llama/Meta-Llama-3.1-405B" \
    --dataset_name="HuggingFaceH4/no_robots" \
    --report_to="wandb" \
    --bf16 true \
    --learning_rate=1e-05 \
    --per_device_train_batch_size=1 \
    --gradient_accumulation_steps=1 \
    --output_dir="big-boi-llama3.1" \
    --logging_steps=1 \
    --eval_strategy="no" \
    --num_train_epochs=3 \
    --max_steps=-1 \
    --gradient_checkpointing \
    --optim adamw_hf \
    --attn_implementation="flash_attention_2" \
    --torch_dtype="bfloat16" \
    --bnb_4bit_quant_type="nf4" \
    --use_peft true \
    --load_in_4bit \
    --lora_r 16 \
    --lora_alpha 32 \
    --lora_target_modules q_proj k_proj v_proj o_proj \
    "
export LAUNCHER="HF_HUB_ENABLE_HF_TRANSFER=1 accelerate launch \
    --config_file ./examples/accelerate_configs/fsdp_qlora.yaml \
    --gradient_accumulation_steps 1 \
    --num_machines $NUM_NODES \
    --num_processes $WORLD_SIZE \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --machine_rank \$SLURM_PROCID \
    --rdzv_conf rdzv_backend=c10d,rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT \
    --tee 3 \
    "
srun $SRUN_ARGS --jobid $SLURM_JOB_ID -u bash -c "$LAUNCHER --role \$SLURMD_NODENAME: $CMD"
echo "END TIME: $(date)"
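# example submission (a sketch: there is no #SBATCH header above, so resources
# go on the command line; partition/account flags depend on your cluster):
#   sbatch --nodes=1 --ntasks-per-node=1 --gpus-per-node=8 --exclusive qlora_405B.slurm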