From a9848e68bf8a4bd6bd1028166f8e715384161bb4 Mon Sep 17 00:00:00 2001
From: "wangang.wa"
Date: Mon, 9 Dec 2024 11:25:20 +0800
Subject: [PATCH] Add readme

---
 benchmarks/accuracy_benchmark/README.md       | 110 +++++++++++++++++
 benchmarks/accuracy_benchmark/fastchat.patch  | 102 ------------------
 .../{mtbench.sh => fastchat.sh}               |  17 ++-
 benchmarks/accuracy_benchmark/llama.sh        |  11 +-
 benchmarks/accuracy_benchmark/llama_acc.sh    |  94 ----------------
 benchmarks/accuracy_benchmark/llama_ds.sh     |  68 ------------
 .../accuracy_benchmark/llama_fsdp_acc.json    |   2 +-
 benchmarks/accuracy_benchmark/llama_torch.sh  |  80 --------------
 benchmarks/accuracy_benchmark/qwen_acc.sh     |  93 ----------------
 benchmarks/accuracy_benchmark/run.sh          |  76 +++++++++++++
 .../accuracy_benchmark/run_accuracy_bench.sh  |  77 -------------
 .../accuracy_benchmark/run_clm.py.acc.patch   |  24 -----
 .../accuracy_benchmark/run_clm.py.torch.patch |  15 ---
 13 files changed, 199 insertions(+), 570 deletions(-)
 create mode 100644 benchmarks/accuracy_benchmark/README.md
 delete mode 100644 benchmarks/accuracy_benchmark/fastchat.patch
 rename benchmarks/accuracy_benchmark/{mtbench.sh => fastchat.sh} (75%)
 delete mode 100644 benchmarks/accuracy_benchmark/llama_acc.sh
 delete mode 100644 benchmarks/accuracy_benchmark/llama_ds.sh
 delete mode 100644 benchmarks/accuracy_benchmark/llama_torch.sh
 delete mode 100644 benchmarks/accuracy_benchmark/qwen_acc.sh
 create mode 100755 benchmarks/accuracy_benchmark/run.sh
 delete mode 100755 benchmarks/accuracy_benchmark/run_accuracy_bench.sh
 delete mode 100644 benchmarks/accuracy_benchmark/run_clm.py.acc.patch
 delete mode 100644 benchmarks/accuracy_benchmark/run_clm.py.torch.patch

diff --git a/benchmarks/accuracy_benchmark/README.md b/benchmarks/accuracy_benchmark/README.md
new file mode 100644
index 0000000..7397f07
--- /dev/null
+++ b/benchmarks/accuracy_benchmark/README.md
@@ -0,0 +1,110 @@
+# Accuracy Benchmark
+
+## Overview
+
+The Accuracy Benchmark evaluates the performance of TorchAcc using [FastChat](https://github.com/AlibabaPAI/FastChat_TorchAcc) against a baseline established by Torch native. The benchmark aims to ensure that TorchAcc maintains an accuracy level comparable to Torch native.
+
+## Evaluation Process
+
+To perform the evaluation, follow these steps:
+
+1. Set Baseline
+
+   ```bash
+   bash ./llama.sh <local_model_dir> 0 <torch_ckpt_dir>
+   ```
+
+   Run the Torch native job using `run_clm.py`, a script copied from HuggingFace Transformers.
+
+2. Run TorchAcc
+
+   ```bash
+   bash ./llama.sh <local_model_dir> 1 <acc_ckpt_dir>
+   ```
+
+   Run the TorchAcc job using the same script as the Torch native baseline.
+
+3. Evaluate Original
+
+   ```bash
+   MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** M6_TENANT=*** bash ./fastchat.sh <local_model_dir>
+   ```
+
+   Evaluate the original checkpoint using FastChat.
+
+4. Evaluate Outputs
+
+   ```bash
+   bash ./fastchat.sh <torch_ckpt_dir>
+   bash ./fastchat.sh <acc_ckpt_dir>
+   ```
+
+   Evaluate the checkpoints produced by the Torch native and TorchAcc jobs, with the same environment variables set as in step 3.
+
+5. Compare Results
+
+   Compare the training and evaluation results.
+
+
+You can simply execute the `run.sh` script to perform all of the steps above.
+
+## Main Files
+
+* run.sh
+
+  The script runs all of the steps above end to end.
+
+  ```bash
+  bash ./run.sh <local_model_dir>
+  ```
+
+* llama.sh
+
+  The script runs the llama training job via `run_clm.py`, using either Torch native or TorchAcc.
+
+  ```bash
+  bash ./llama.sh <local_model_dir> <use_torchacc: 0|1> [checkpoint_output_dir]
+  ```
+
+* fastchat.sh
+
+  The script runs the evaluation task on your checkpoint.
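+
+  `ENV_VARIABLES` in the generic form below stands for the judge-API settings
+  that the script checks at startup: `MIT_SPIDER_TOKEN`, `MIT_SPIDER_URL`, and
+  `M6_TENANT`. For example (the checkpoint path here is illustrative):
+
+  ```bash
+  MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** M6_TENANT=M6 \
+    bash ./fastchat.sh ./result/20241205_223009/acc_ckpt
+  ```
+
+  The generic form is: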
+ + ```bash + ENV_VARIABLES bash ./fastchat.sh + ``` + +## Evaluation Results + +The evaluation results are shown as follows: + +``` + +==================== Training Results ==================== +Torch train loss = 2.091632914827291 +TorchAcc train loss = 2.0917317353245495 +Torch train runtime (s) = 2552.8252 +TorchAcc train runtime (s) = 2272.1399 +Torch train steps per second = 5.785 +TorchAcc train steps per second = 6.5 + +=================== Evaluation Results =================== +Original Model Score = 1.4625 +Torch Model Score = 1.1125 +TorchAcc Model Score = 1.100629 + +More details can be found in = ./result/20241205_223009 +========================================================== + +``` \ No newline at end of file diff --git a/benchmarks/accuracy_benchmark/fastchat.patch b/benchmarks/accuracy_benchmark/fastchat.patch deleted file mode 100644 index a63484e..0000000 --- a/benchmarks/accuracy_benchmark/fastchat.patch +++ /dev/null @@ -1,102 +0,0 @@ -diff --git a/fastchat/llm_judge/common.py b/fastchat/llm_judge/common.py -index d2640d6..b1a4faf 100644 ---- a/fastchat/llm_judge/common.py -+++ b/fastchat/llm_judge/common.py -@@ -8,6 +8,7 @@ import glob - import json - import os - import re -+import requests - import time - from typing import Optional - -@@ -403,27 +404,59 @@ def play_a_match_pair(match: MatchPair, output_file: str): - - return result - -+def mit_spider_openai(model, temperature, max_tokens, messages): -+ MAX_API_RETRY = 3 -+ LLM_MIT_RETRY_SLEEP = 5 -+ kwargs={} -+ kwargs['model']=model -+ kwargs['temperature']=temperature -+ kwargs['max_tokens']=max_tokens -+ kwargs['messages']=messages -+ -+ if not os.environ.get('MIT_SPIDER_TOKEN', None): -+ print("NO MIT_SPIDER_TOKEN FOUND,please set export MIT_SPIDER_TOKEN=") -+ if not os.environ.get('MIT_SPIDER_URL', None): -+ print("NO MIT_SPIDER_URL FOUND,please set export MIT_SPIDER_URL=") -+ mit_spider_config = { -+ "url": os.environ.get("MIT_SPIDER_URL", None), -+ "header": { -+ "Content-Type": "application/json", -+ "Authorization": f"Bearer {os.environ.get('MIT_SPIDER_TOKEN', None)}" -+ } -+ } -+ tenant = None -+ if kwargs['model'].startswith('gpt-4') and os.environ.get("M6_TENANT", None): -+ tenant = os.environ.get("M6_TENANT") -+ response = None -+ for i in range(MAX_API_RETRY): -+ try: -+ if tenant: -+ payload = {'tenant': tenant} -+ else: -+ payload = dict() -+ for k, w in kwargs.items(): -+ payload[f"{k}"] = w -+ response = requests.post(mit_spider_config['url'], json=payload, headers=mit_spider_config['header']).json() -+ except Exception as e: -+ print(response, e) -+ time.sleep(LLM_MIT_RETRY_SLEEP) -+ continue -+ if response['code'] == 200: -+ return response -+ else: -+ time.sleep(LLM_MIT_RETRY_SLEEP) -+ print(response) -+ return None - - def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): -- if api_dict is not None: -- openai.api_base = api_dict["api_base"] -- openai.api_key = api_dict["api_key"] -- output = API_ERROR_OUTPUT -+ messages = conv.to_openai_api_messages() - for _ in range(API_MAX_RETRY): -- try: -- messages = conv.to_openai_api_messages() -- response = openai.ChatCompletion.create( -- model=model, -- messages=messages, -- n=1, -- temperature=temperature, -- max_tokens=max_tokens, -- ) -- output = response["choices"][0]["message"]["content"] -+ output = mit_spider_openai(model, temperature, max_tokens, messages) -+ if output is not None and output['code'] == 200: - break -- except openai.error.OpenAIError as e: -- print(type(e), e) -- time.sleep(API_RETRY_SLEEP) -+ 
print("====catch error:", output, flush=True) -+ time.sleep(API_RETRY_SLEEP) -+ output = output['data']['response']["choices"][0]["message"]["content"] - - return output - -diff --git a/fastchat/llm_judge/gen_judgment.py b/fastchat/llm_judge/gen_judgment.py -index a1c70b2..861d9b7 100644 ---- a/fastchat/llm_judge/gen_judgment.py -+++ b/fastchat/llm_judge/gen_judgment.py -@@ -301,7 +301,7 @@ if __name__ == "__main__": - # Show match stats and prompt enter to continue - print("Stats:") - print(json.dumps(match_stat, indent=4)) -- input("Press Enter to confirm...") -+ #input("Press Enter to confirm...") - - # Play matches - if args.parallel == 1: diff --git a/benchmarks/accuracy_benchmark/mtbench.sh b/benchmarks/accuracy_benchmark/fastchat.sh similarity index 75% rename from benchmarks/accuracy_benchmark/mtbench.sh rename to benchmarks/accuracy_benchmark/fastchat.sh index 599ee5c..7ea9f52 100644 --- a/benchmarks/accuracy_benchmark/mtbench.sh +++ b/benchmarks/accuracy_benchmark/fastchat.sh @@ -1,19 +1,17 @@ #!/bin/bash -# $1: local model directory if [ "$#" -ne 1 ]; then - echo "Usage: MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** $0 " + echo "Usage: MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** M6_TENANT=*** $0 " echo "You must provide exactly 1 parameters." exit 1 fi -if [[ -z "${MIT_SPIDER_TOKEN}" ]]; then - echo "Error: Environment variable MIT_SPIDER_TOKEN is not set." >&2 - exit 1 -fi - -if [[ -z "${MIT_SPIDER_URL}" ]]; then - echo "Error: Environment variable MIT_SPIDER_URL is not set." >&2 +if [[ -z "${MIT_SPIDER_TOKEN}" || -z "${MIT_SPIDER_URL}" || -z "${M6_TENANT}" ]]; then + echo "Error: One or more required environment variables are not set." + echo "Required variables:" + [[ -z "${MIT_SPIDER_TOKEN}" ]] && echo " - MIT_SPIDER_TOKEN" + [[ -z "${MIT_SPIDER_URL}" ]] && echo " - MIT_SPIDER_URL" + [[ -z "${M6_TENANT}" ]] && echo " - M6_TENANT" exit 1 fi @@ -21,7 +19,6 @@ MODEL_DIR=$(realpath $1) MODEL_ID=$(basename "$MODEL_DIR")_$(date +"%Y%m%d_%H%M%S") NUM_GPUS_TOTAL=1 JUDGMENT_PARALLEL=4 -export M6_TENANT=M6 function install_fastchat { if [[ ! -d "FastChat" ]]; then diff --git a/benchmarks/accuracy_benchmark/llama.sh b/benchmarks/accuracy_benchmark/llama.sh index 5de8224..f4f36e3 100755 --- a/benchmarks/accuracy_benchmark/llama.sh +++ b/benchmarks/accuracy_benchmark/llama.sh @@ -15,9 +15,8 @@ WORLD_SIZE="${WORLD_SIZE:-1}" MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" MASTER_PORT="${MASTER_PORT:-9010}" NPROC_PER_NODE="${NPROC_PER_NODE:-8}" -BS="${BS:-2}" +BATCH_SIZE="${BATCH_SIZE:-2}" SEQLEN="${SEQLEN:-1024}" -TASK_TAG="${TASK_TAG:-0000}" PRECISION="bf16=true" RUN_CLM=./run_clm.py @@ -33,10 +32,10 @@ elif [ "$USE_TORCHACC" -eq 1 ]; then export ACCELERATE_USE_FSDP=true export PJRT_USE_TORCH_ALLOCATOR=true export LOW_CPU_MEM_USAGE=1 - export XLA_PERSISTENT_CACHE_PATH=./compiled_cache # uncomment this line to cache the compile results and speed up initialization. 
+ export XLA_PERSISTENT_CACHE_PATH=./compiled_cache + FSDP_CONFIG="llama_fsdp_acc.json" TEMP_OUTPUT_DIR=$(basename "$MODEL_DIR")_acc OUTPUTS_DIR=${3:-$TEMP_OUTPUT_DIR} - FSDP_CONFIG="llama_fsdp_acc.json" else echo "The third argument must be 0 or 1" exit 1 @@ -53,8 +52,8 @@ torchrun --nproc_per_node "$NPROC_PER_NODE" \ --dataset_name wikitext \ --dataset_config_name wikitext-103-raw-v1 \ --use_fast_tokenizer false \ - --per_device_train_batch_size "$BS" \ - --per_device_eval_batch_size "$BS" \ + --per_device_train_batch_size "$BATCH_SIZE" \ + --per_device_eval_batch_size "$BATCH_SIZE" \ --do_train \ --output_dir "$OUTPUTS_DIR" \ --overwrite_output_dir \ diff --git a/benchmarks/accuracy_benchmark/llama_acc.sh b/benchmarks/accuracy_benchmark/llama_acc.sh deleted file mode 100644 index a413d07..0000000 --- a/benchmarks/accuracy_benchmark/llama_acc.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash - -# $1: the HF transformers dir -# $2: local model directory -# $3: 0 or 1 to indicate using torchacc or not -if [ "$#" -ne 2 ]; then - echo "Usage: $0 <0 or 1 to indicate using torchacc or not>" - echo "You must provide exactly 2 parameters." - exit 1 -fi - -export PJRT_DEVICE=CUDA -export XLA_FLAGS='--xla_gpu_memory_limit_slop_factor=500 --xla_multiheap_size_constraint_per_heap=15032385536' -export ACCELERATE_USE_FSDP=true -export PJRT_USE_TORCH_ALLOCATOR=true -# export LOW_CPU_MEM_USAGE=1 -# export XLA_PERSISTENT_CACHE_PATH=./compiled_cache # uncomment this line to cache the compile results and speed up initialization. - -RANK="${RANK:-0}" -WORLD_SIZE="${WORLD_SIZE:-1}" -MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" -MASTER_PORT="${MASTER_PORT:-9010}" -NPROC_PER_NODE="${NPROC_PER_NODE:-8}" -BS="${BS:-2}" -SEQLEN="${SEQLEN:-1024}" -TASK_TAG="${TASK_TAG:-0000}" - -PRECISION="bf16=true" -JOB_NAME="LLAMA_FSDP_TORCHACC_GPU${NPROC_PER_NODE}_BS${BS}_SEQLEN${SEQLEN}_BF16" -FSDP_CONFIG="llama_fsdp_acc.json" -CLS_TO_WRAP="LlamaDecoderLayer" - -TRANSFORMERS_DIR=$(realpath "$1") -MODEL_DIR=$(realpath "$2") -OUTPUTS_DIR=$(basename "$MODEL_DIR")_acc -RUN_CLM=$TRANSFORMERS_DIR/examples/pytorch/language-modeling/run_clm.py - -# Patch the run_clm.py -PATCH_FILE=$(realpath ./run_clm.py.acc.patch) -git config --global --add safe.directory $TRANSFORMERS_DIR -pushd $TRANSFORMERS_DIR -git checkout . -patch -p1 < $PATCH_FILE -popd - -# This is the training config. You can change it as you need. -cat >"$FSDP_CONFIG" < "$FSDP_CONFIG" -{ - "train_batch_size": $((BS*8)), - "train_micro_batch_size_per_gpu": $BS, - "optimizer": { - "type": "AdamW" - }, - "zero_optimization": { - "stage": 3 - }, - "bf16": { - "enabled": true - } -} -EOF - - -echo "Running a deepspeed job ..." 
-export USE_TORCH_XLA=0 - -NPROC_PER_NODE=8 -PRECISION="bf16=true" -JOB_NAME="QWEN_FSDP_DEEPSPEED_GPU${NPROC_PER_NODE}_BS${BS}_SEQLEN${SEQLEN}_BF16" - - -torchrun --nproc_per_node $NPROC_PER_NODE \ - --nnodes $WORLD_SIZE \ - --node_rank $RANK \ - --master_port $MASTER_PORT \ - --master_addr $MASTER_ADDR \ - $RUN_CLM \ - --num_train_epochs 2 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-103-raw-v1 \ - --use_fast_tokenizer false \ - --per_device_train_batch_size $BS \ - --per_device_eval_batch_size $BS \ - --do_train \ - --output_dir ./outputs_ds \ - --overwrite_output_dir \ - --model_name_or_path $MODEL_NAME_OR_PATH \ - --tokenizer_name $MODEL_NAME_OR_PATH \ - --trust_remote_code true \ - --cache_dir ./cache \ - --block_size $SEQLEN \ - --optim adamw_torch \ - --save_strategy no \ - --logging_strategy steps \ - --gradient_checkpointing no \ - --logging_steps 100 \ - --max_train_samples 100 \ - --$PRECISION \ - --deepspeed $FSDP_CONFIG 2>&1 | tee ./$JOB_NAME.log diff --git a/benchmarks/accuracy_benchmark/llama_fsdp_acc.json b/benchmarks/accuracy_benchmark/llama_fsdp_acc.json index 6b40864..23c4a56 100644 --- a/benchmarks/accuracy_benchmark/llama_fsdp_acc.json +++ b/benchmarks/accuracy_benchmark/llama_fsdp_acc.json @@ -10,5 +10,5 @@ "pin_layout_in_collective_ops": false, "flatten_parameters": true }, - "xla_fsdp_grad_ckpt": false + "xla_fsdp_grad_ckpt": true } diff --git a/benchmarks/accuracy_benchmark/llama_torch.sh b/benchmarks/accuracy_benchmark/llama_torch.sh deleted file mode 100644 index 9fb7b06..0000000 --- a/benchmarks/accuracy_benchmark/llama_torch.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -# $1: the HF transformers dir -# $2: local model directory -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - echo "You must provide exactly 2 parameters." - exit 1 -fi - -export USE_TORCH_XLA=0 - -RANK="${RANK:-0}" -WORLD_SIZE="${WORLD_SIZE:-1}" -MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" -MASTER_PORT="${MASTER_PORT:-9010}" -NPROC_PER_NODE="${NPROC_PER_NODE:-8}" -BS="${BS:-2}" -SEQLEN="${SEQLEN:-1024}" -TASK_TAG="${TASK_TAG:-0000}" - -PRECISION="bf16=true" -JOB_NAME="LLAMA_FSDP_TORCH_GPU${NPROC_PER_NODE}_BS${BS}_SEQLEN${SEQLEN}_BF16" -FSDP_CONFIG="llama_fsdp_torch.json" -CLS_TO_WRAP="LlamaDecoderLayer" - -TRANSFORMERS_DIR=$(realpath "$1") -MODEL_DIR=$(realpath "$2") -OUTPUTS_DIR=$(basename "$MODEL_DIR")_torch -RUN_CLM=$TRANSFORMERS_DIR/examples/pytorch/language-modeling/run_clm.py - -# Patch the run_clm.py -PATCH_FILE=$(realpath ./run_clm.py.torch.patch) -git config --global --add safe.directory $TRANSFORMERS_DIR -pushd $TRANSFORMERS_DIR -git checkout . -patch -p1 < $PATCH_FILE -popd - -# This is the training config. You can change it as you need. -cat >"$FSDP_CONFIG" < " - echo "You must provide exactly 2 parameters." - exit 1 -fi - -export PJRT_DEVICE=CUDA -export XLA_FLAGS='--xla_gpu_memory_limit_slop_factor=500 --xla_multiheap_size_constraint_per_heap=15032385536' -export ACCELERATE_USE_FSDP=true -export PJRT_USE_TORCH_ALLOCATOR=true -# export LOW_CPU_MEM_USAGE=1 -# export XLA_PERSISTENT_CACHE_PATH=./compiled_cache # uncomment this line to cache the compile results and speed up initialization. 
- -RANK="${RANK:-0}" -WORLD_SIZE="${WORLD_SIZE:-1}" -MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" -MASTER_PORT="${MASTER_PORT:-9010}" -NPROC_PER_NODE="${NPROC_PER_NODE:-8}" -BS="${BS:-1}" -SEQLEN="${SEQLEN:-4096}" -TASK_TAG="${TASK_TAG:-0000}" - -PRECISION="bf16=true" -JOB_NAME="qwen_FSDP_TORCHACC_GPU${NPROC_PER_NODE}_BS${BS}_SEQLEN${SEQLEN}_BF16" -FSDP_CONFIG="qwen_fsdp_acc.json" -CLS_TO_WRAP="Qwen2DecoderLayer" - -TRANSFORMERS_DIR=$(realpath "$1") -MODEL_DIR=$(realpath "$2") -OUTPUTS_DIR=$(basename "$MODEL_DIR")_acc -RUN_CLM=$TRANSFORMERS_DIR/examples/pytorch/language-modeling/run_clm.py - -# Patch the run_clm.py -PATCH_FILE=$(realpath ./run_clm.py.acc.patch) -git config --global --add safe.directory $TRANSFORMERS_DIR -pushd $TRANSFORMERS_DIR -git checkout . -patch -p1 < $PATCH_FILE -popd - -# This is the training config. You can change it as you need. -cat >"$FSDP_CONFIG" <" + echo "You must provide exactly 1 parameters." + exit 1 +fi + +MODEL_DIR=$(realpath "$1") +MODEL_NAME=$(basename "$MODEL_DIR") +TIMESTAMP=$(date +"%Y%m%d_%H%M%S") +RES_FOLDER="./result/$TIMESTAMP" +MODEL_NAME_TORCH="$RES_FOLDER/torch_ckpt" +MODEL_NAME_ACC="$RES_FOLDER/acc_ckpt" +TORCH_TRAIN_LOG="$RES_FOLDER/torch_training.log" +ACC_TRAIN_LOG="$RES_FOLDER/acc_training.log" +ORIG_MODEL_EVAL_LOG="$RES_FOLDER/original_model_eval.log" +TORCH_MODEL_EVAL_LOG="$RES_FOLDER/torch_model_eval.log" +ACC_MODEL_EVAL_LOG="$RES_FOLDER/acc_model_eval.log" +RES_LOG_FILE="$RES_FOLDER/result.log" + +mkdir -p $RES_FOLDER + + +# Run the torch native job +bash ./llama.sh "$MODEL_DIR" 0 $MODEL_NAME_TORCH 2>&1 | tee $TORCH_TRAIN_LOG + +# Run the torchacc job +bash ./llama.sh "$MODEL_DIR" 1 $MODEL_NAME_ACC 2>&1 | tee $ACC_TRAIN_LOG + +# Evaluate original checkpoint +bash ./fastchat.sh "$MODEL_DIR" 2>&1 | tee $ORIG_MODEL_EVAL_LOG + +# Evaluate Torch job +bash ./fastchat.sh "$MODEL_NAME_TORCH" 2>&1 | tee $TORCH_MODEL_EVAL_LOG + +# Evaluate TorchAcc job +bash ./fastchat.sh "$MODEL_NAME_ACC" 2>&1 | tee $ACC_MODEL_EVAL_LOG + +# Collect and compare the results +ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}') +TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}') +ACC_SCORE=$(tail -1 $ACC_MODEL_EVAL_LOG | awk '{print $NF}') + +TORCH_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $TORCH_TRAIN_LOG) +TORCH_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $TORCH_TRAIN_LOG) +TORCH_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $TORCH_TRAIN_LOG) +ACC_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $ACC_TRAIN_LOG) +ACC_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $ACC_TRAIN_LOG) +ACC_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $ACC_TRAIN_LOG) + + +RESET='\033[0m' +RED='\033[31m' +GREEN='\033[32m' +YELLOW='\033[33m' +BLUE='\033[34m' +CYAN='\033[36m' + +{ + echo -e "\n${BLUE}==================== Training Results ====================${RESET}" + echo -e "${YELLOW}Torch train loss = ${GREEN}${TORCH_TRAIN_LOSS}${RESET}" + echo -e "${YELLOW}TorchAcc train loss = ${GREEN}${ACC_TRAIN_LOSS}${RESET}" + echo -e "${YELLOW}Torch train runtime (s) = ${GREEN}${TORCH_TRAIN_RUNTIME}${RESET}" + echo -e "${YELLOW}TorchAcc train runtime (s) = ${GREEN}${ACC_TRAIN_RUNTIME}${RESET}" + echo -e "${YELLOW}Torch train steps per second = ${GREEN}${TORCH_TRAIN_STEPS_PER_SECOND}${RESET}" + echo -e "${YELLOW}TorchAcc train steps per second = ${GREEN}${ACC_TRAIN_STEPS_PER_SECOND}${RESET}" + + echo -e "\n${BLUE}=================== Evaluation Results ===================${RESET}" + echo -e 
"${YELLOW}Original Model Score = ${GREEN}${ORIG_SCORE}${RESET}" + echo -e "${YELLOW}Torch Model Score = ${GREEN}${TORCH_SCORE}${RESET}" + echo -e "${YELLOW}TorchAcc Model Score = ${GREEN}${ACC_SCORE}${RESET}" + + echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}" + echo -e "${BLUE}==========================================================${RESET}" +} | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE) diff --git a/benchmarks/accuracy_benchmark/run_accuracy_bench.sh b/benchmarks/accuracy_benchmark/run_accuracy_bench.sh deleted file mode 100755 index b36cbd2..0000000 --- a/benchmarks/accuracy_benchmark/run_accuracy_bench.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - echo "You must provide exactly 1 parameters." - exit 1 -fi - -MODEL_DIR=$(realpath "$1") -MODEL_NAME=$(basename "$MODEL_DIR") -TIMESTAMP=$(date +"%Y%m%d_%H%M%S") -RES_FOLDER="./result/$TIMESTAMP" -MODEL_NAME_TORCH="$RES_FOLDER/torch_ckpt" -MODEL_NAME_ACC="$RES_FOLDER/acc_ckpt" -TORCH_TRAIN_LOG="$RES_FOLDER/torch_training.log" -ACC_TRAIN_LOG="$RES_FOLDER/acc_training.log" -ORIG_MODEL_EVAL_LOG="$RES_FOLDER/original_model_eval.log" -TORCH_MODEL_EVAL_LOG="$RES_FOLDER/torch_model_eval.log" -ACC_MODEL_EVAL_LOG="$RES_FOLDER/acc_model_eval.log" -RES_LOG_FILE="$RES_FOLDER/result.log" - -mkdir -p $RES_FOLDER - - -# Run the torch native job -bash ./llama.sh "$MODEL_DIR" 0 $MODEL_NAME_TORCH 2>&1 | tee $TORCH_TRAIN_LOG - -# Run the torchacc job -bash ./llama.sh "$MODEL_DIR" 1 $MODEL_NAME_ACC 2>&1 | tee $ACC_TRAIN_LOG - -# Evaluate original checkpoint -bash ./mtbench.sh "$MODEL_DIR" 2>&1 | tee $ORIG_MODEL_EVAL_LOG - -# Evaluate Torch job -bash ./mtbench.sh "$MODEL_NAME_TORCH" 2>&1 | tee $TORCH_MODEL_EVAL_LOG - -# Evaluate TorchAcc job -bash ./mtbench.sh "$MODEL_NAME_ACC" 2>&1 | tee $ACC_MODEL_EVAL_LOG - -# Collect and compare the results -ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}') -TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}') -ACC_SCORE=$(tail -1 $ACC_MODEL_EVAL_LOG | awk '{print $NF}') - -torch_train_loss=$(grep -oP 'train_loss\s*=\s*\K[0-9.]*' $TORCH_TRAIN_LOG) -torch_train_runtime=$(grep -oP 'train_runtime\s*=\s*\K[0-9:.]*' $TORCH_TRAIN_LOG) -torch_train_samples_per_second=$(grep -oP 'train_samples_per_second\s*=\s*\K[0-9.]*' $TORCH_TRAIN_LOG) -acc_train_loss=$(grep -oP 'train_loss\s*=\s*\K[0-9.]*' $ACC_TRAIN_LOG) -acc_train_runtime=$(grep -oP 'train_runtime\s*=\s*\K[0-9:.]*' $ACC_TRAIN_LOG) -acc_train_samples_per_second=$(grep -oP 'train_samples_per_second\s*=\s*\K[0-9.]*' $ACC_TRAIN_LOG) - - -RESET='\033[0m' -RED='\033[31m' -GREEN='\033[32m' -YELLOW='\033[33m' -BLUE='\033[34m' -CYAN='\033[36m' - -{ - echo -e "${BLUE}==================== Training Results ====================${RESET}" - echo -e "${YELLOW}Torch train loss = ${GREEN}${torch_train_loss}${RESET}" - echo -e "${YELLOW}TorchAcc train loss = ${GREEN}${acc_train_loss}${RESET}" - echo -e "${YELLOW}Torch train runtime = ${GREEN}${torch_train_runtime}${RESET}" - echo -e "${YELLOW}TorchAcc train runtime = ${GREEN}${acc_train_runtime}${RESET}" - echo -e "${YELLOW}Torch train samples per second = ${GREEN}${torch_train_samples_per_second}${RESET}" - echo -e "${YELLOW}TorchAcc train samples per second = ${GREEN}${acc_train_samples_per_second}${RESET}" - - - echo -e "${BLUE}==================== Evaluate Results ====================${RESET}" - echo -e "${YELLOW}Original Model Score = ${GREEN}${ORIG_SCORE}${RESET}" - echo -e "${YELLOW}Torch Model Score = ${GREEN}${TORCH_SCORE}${RESET}" 
- echo -e "${YELLOW}TorchAcc Model Score = ${GREEN}${ACC_SCORE}${RESET}" - - echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}" - echo -e "${BLUE}==========================================================${RESET}" -} | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE) diff --git a/benchmarks/accuracy_benchmark/run_clm.py.acc.patch b/benchmarks/accuracy_benchmark/run_clm.py.acc.patch deleted file mode 100644 index 444981b..0000000 --- a/benchmarks/accuracy_benchmark/run_clm.py.acc.patch +++ /dev/null @@ -1,24 +0,0 @@ -diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py -index d3f8ad8da..7e14b7d3c 100755 ---- a/examples/pytorch/language-modeling/run_clm.py -+++ b/examples/pytorch/language-modeling/run_clm.py -@@ -20,6 +20,8 @@ Here is the full list of checkpoints on the hub that can be fine-tuned by this s - https://huggingface.co/models?filter=text-generation - """ - # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. -+import torchacc -+torchacc.utils.patch.patch_llama(1) - - import logging - import math -@@ -434,9 +436,10 @@ def main(): - trust_remote_code=model_args.trust_remote_code, - torch_dtype=torch_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, -+ attn_implementation='flash_attention_2' - ) - else: -- model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) -+ model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code, attn_implementation='flash_attention_2') - n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") diff --git a/benchmarks/accuracy_benchmark/run_clm.py.torch.patch b/benchmarks/accuracy_benchmark/run_clm.py.torch.patch deleted file mode 100644 index fd7de8c..0000000 --- a/benchmarks/accuracy_benchmark/run_clm.py.torch.patch +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py -index d3f8ad8da..7e14b7d3c 100755 ---- a/examples/pytorch/language-modeling/run_clm.py -+++ b/examples/pytorch/language-modeling/run_clm.py -@@ -434,9 +436,10 @@ def main(): - trust_remote_code=model_args.trust_remote_code, - torch_dtype=torch_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, -+ attn_implementation='flash_attention_2' - ) - else: -- model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) -+ model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code, attn_implementation='flash_attention_2') - n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")