From a9848e68bf8a4bd6bd1028166f8e715384161bb4 Mon Sep 17 00:00:00 2001
From: "wangang.wa"
Date: Mon, 9 Dec 2024 11:25:20 +0800
Subject: [PATCH] Add readme

---
 benchmarks/accuracy_benchmark/README.md       | 110 +++++++++++++++++
 benchmarks/accuracy_benchmark/fastchat.patch  | 102 ------------------
 .../{mtbench.sh => fastchat.sh}               |  17 ++-
 benchmarks/accuracy_benchmark/llama.sh        |  11 +-
 benchmarks/accuracy_benchmark/llama_acc.sh    |  94 ----------------
 benchmarks/accuracy_benchmark/llama_ds.sh     |  68 ------------
 .../accuracy_benchmark/llama_fsdp_acc.json    |   2 +-
 benchmarks/accuracy_benchmark/llama_torch.sh  |  80 --------------
 benchmarks/accuracy_benchmark/qwen_acc.sh     |  93 ----------------
 benchmarks/accuracy_benchmark/run.sh          |  76 +++++++++++++
 .../accuracy_benchmark/run_accuracy_bench.sh  |  77 -------------
 .../accuracy_benchmark/run_clm.py.acc.patch   |  24 -----
 .../accuracy_benchmark/run_clm.py.torch.patch |  15 ---
 13 files changed, 199 insertions(+), 570 deletions(-)
 create mode 100644 benchmarks/accuracy_benchmark/README.md
 delete mode 100644 benchmarks/accuracy_benchmark/fastchat.patch
 rename benchmarks/accuracy_benchmark/{mtbench.sh => fastchat.sh} (75%)
 delete mode 100644 benchmarks/accuracy_benchmark/llama_acc.sh
 delete mode 100644 benchmarks/accuracy_benchmark/llama_ds.sh
 delete mode 100644 benchmarks/accuracy_benchmark/llama_torch.sh
 delete mode 100644 benchmarks/accuracy_benchmark/qwen_acc.sh
 create mode 100755 benchmarks/accuracy_benchmark/run.sh
 delete mode 100755 benchmarks/accuracy_benchmark/run_accuracy_bench.sh
 delete mode 100644 benchmarks/accuracy_benchmark/run_clm.py.acc.patch
 delete mode 100644 benchmarks/accuracy_benchmark/run_clm.py.torch.patch

diff --git a/benchmarks/accuracy_benchmark/README.md b/benchmarks/accuracy_benchmark/README.md
new file mode 100644
index 0000000..7397f07
--- /dev/null
+++ b/benchmarks/accuracy_benchmark/README.md
@@ -0,0 +1,110 @@
+# Accuracy Benchmark
+
+## Overview
+
+The Accuracy Benchmark evaluates the performance of TorchAcc using [FastChat](https://github.com/AlibabaPAI/FastChat_TorchAcc) against a baseline established by Torch native. The benchmark aims to ensure that TorchAcc maintains an accuracy level comparable to Torch native.
+
+## Evaluation Process
+
+To perform the evaluation, follow these steps:
+
+1. Set Baseline
+
+   ```bash
+   bash ./llama.sh <local_model_dir> 0 <torch_ckpt_dir>
+   ```
+
+   Run the Torch native job using `run_clm.py`, a script copied from HuggingFace Transformers.
+
+2. Run TorchAcc
+
+   ```bash
+   bash ./llama.sh <local_model_dir> 1 <acc_ckpt_dir>
+   ```
+
+   Run the TorchAcc job using the same script as the Torch native baseline.
+
+3. Evaluate Original
+
+   ```bash
+   MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** M6_TENANT=*** bash ./fastchat.sh <local_model_dir>
+   ```
+
+   Evaluate the original checkpoint using FastChat.
+
+4. Evaluate Outputs
+
+   ```bash
+   bash ./fastchat.sh <torch_ckpt_dir>
+   bash ./fastchat.sh <acc_ckpt_dir>
+   ```
+
+   Evaluate the checkpoints produced by the Torch native and TorchAcc jobs, with the same environment variables set as in step 3.
+
+5. Compare Results
+
+   Compare the training and evaluation results.
+
+
+You can simply execute the `run.sh` script to perform all of the steps above.
+
+## Main Files
+
+* run.sh
+
+  The script runs all of the steps above end to end.
+
+  ```bash
+  bash ./run.sh <local_model_dir>
+  ```
+
+* llama.sh
+
+  The script runs the llama training job via `run_clm.py`, using either Torch native or TorchAcc.
+
+  ```bash
+  bash ./llama.sh <local_model_dir> <use_torchacc: 0|1> [checkpoint_output_dir]
+  ```
+
+* fastchat.sh
+
+  The script runs the evaluation task on your checkpoint.
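+
+  `ENV_VARIABLES` in the generic form below stands for the judge-API settings
+  that the script checks at startup: `MIT_SPIDER_TOKEN`, `MIT_SPIDER_URL`, and
+  `M6_TENANT`. For example (the checkpoint path here is illustrative):
+
+  ```bash
+  MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** M6_TENANT=M6 \
+    bash ./fastchat.sh ./result/20241205_223009/acc_ckpt
+  ```
+
+  The generic form is: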
+ + ```bash + ENV_VARIABLES bash ./fastchat.sh + ``` + +## Evaluation Results + +The evaluation results are shown as follows: + +``` + +==================== Training Results ==================== +Torch train loss = 2.091632914827291 +TorchAcc train loss = 2.0917317353245495 +Torch train runtime (s) = 2552.8252 +TorchAcc train runtime (s) = 2272.1399 +Torch train steps per second = 5.785 +TorchAcc train steps per second = 6.5 + +=================== Evaluation Results =================== +Original Model Score = 1.4625 +Torch Model Score = 1.1125 +TorchAcc Model Score = 1.100629 + +More details can be found in = ./result/20241205_223009 +========================================================== + +``` \ No newline at end of file diff --git a/benchmarks/accuracy_benchmark/fastchat.patch b/benchmarks/accuracy_benchmark/fastchat.patch deleted file mode 100644 index a63484e..0000000 --- a/benchmarks/accuracy_benchmark/fastchat.patch +++ /dev/null @@ -1,102 +0,0 @@ -diff --git a/fastchat/llm_judge/common.py b/fastchat/llm_judge/common.py -index d2640d6..b1a4faf 100644 ---- a/fastchat/llm_judge/common.py -+++ b/fastchat/llm_judge/common.py -@@ -8,6 +8,7 @@ import glob - import json - import os - import re -+import requests - import time - from typing import Optional - -@@ -403,27 +404,59 @@ def play_a_match_pair(match: MatchPair, output_file: str): - - return result - -+def mit_spider_openai(model, temperature, max_tokens, messages): -+ MAX_API_RETRY = 3 -+ LLM_MIT_RETRY_SLEEP = 5 -+ kwargs={} -+ kwargs['model']=model -+ kwargs['temperature']=temperature -+ kwargs['max_tokens']=max_tokens -+ kwargs['messages']=messages -+ -+ if not os.environ.get('MIT_SPIDER_TOKEN', None): -+ print("NO MIT_SPIDER_TOKEN FOUND,please set export MIT_SPIDER_TOKEN=") -+ if not os.environ.get('MIT_SPIDER_URL', None): -+ print("NO MIT_SPIDER_URL FOUND,please set export MIT_SPIDER_URL=") -+ mit_spider_config = { -+ "url": os.environ.get("MIT_SPIDER_URL", None), -+ "header": { -+ "Content-Type": "application/json", -+ "Authorization": f"Bearer {os.environ.get('MIT_SPIDER_TOKEN', None)}" -+ } -+ } -+ tenant = None -+ if kwargs['model'].startswith('gpt-4') and os.environ.get("M6_TENANT", None): -+ tenant = os.environ.get("M6_TENANT") -+ response = None -+ for i in range(MAX_API_RETRY): -+ try: -+ if tenant: -+ payload = {'tenant': tenant} -+ else: -+ payload = dict() -+ for k, w in kwargs.items(): -+ payload[f"{k}"] = w -+ response = requests.post(mit_spider_config['url'], json=payload, headers=mit_spider_config['header']).json() -+ except Exception as e: -+ print(response, e) -+ time.sleep(LLM_MIT_RETRY_SLEEP) -+ continue -+ if response['code'] == 200: -+ return response -+ else: -+ time.sleep(LLM_MIT_RETRY_SLEEP) -+ print(response) -+ return None - - def chat_completion_openai(model, conv, temperature, max_tokens, api_dict=None): -- if api_dict is not None: -- openai.api_base = api_dict["api_base"] -- openai.api_key = api_dict["api_key"] -- output = API_ERROR_OUTPUT -+ messages = conv.to_openai_api_messages() - for _ in range(API_MAX_RETRY): -- try: -- messages = conv.to_openai_api_messages() -- response = openai.ChatCompletion.create( -- model=model, -- messages=messages, -- n=1, -- temperature=temperature, -- max_tokens=max_tokens, -- ) -- output = response["choices"][0]["message"]["content"] -+ output = mit_spider_openai(model, temperature, max_tokens, messages) -+ if output is not None and output['code'] == 200: - break -- except openai.error.OpenAIError as e: -- print(type(e), e) -- time.sleep(API_RETRY_SLEEP) -+ 
print("====catch error:", output, flush=True) -+ time.sleep(API_RETRY_SLEEP) -+ output = output['data']['response']["choices"][0]["message"]["content"] - - return output - -diff --git a/fastchat/llm_judge/gen_judgment.py b/fastchat/llm_judge/gen_judgment.py -index a1c70b2..861d9b7 100644 ---- a/fastchat/llm_judge/gen_judgment.py -+++ b/fastchat/llm_judge/gen_judgment.py -@@ -301,7 +301,7 @@ if __name__ == "__main__": - # Show match stats and prompt enter to continue - print("Stats:") - print(json.dumps(match_stat, indent=4)) -- input("Press Enter to confirm...") -+ #input("Press Enter to confirm...") - - # Play matches - if args.parallel == 1: diff --git a/benchmarks/accuracy_benchmark/mtbench.sh b/benchmarks/accuracy_benchmark/fastchat.sh similarity index 75% rename from benchmarks/accuracy_benchmark/mtbench.sh rename to benchmarks/accuracy_benchmark/fastchat.sh index 599ee5c..7ea9f52 100644 --- a/benchmarks/accuracy_benchmark/mtbench.sh +++ b/benchmarks/accuracy_benchmark/fastchat.sh @@ -1,19 +1,17 @@ #!/bin/bash -# $1: local model directory if [ "$#" -ne 1 ]; then - echo "Usage: MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** $0 " + echo "Usage: MIT_SPIDER_TOKEN=*** MIT_SPIDER_URL=*** M6_TENANT=*** $0 " echo "You must provide exactly 1 parameters." exit 1 fi -if [[ -z "${MIT_SPIDER_TOKEN}" ]]; then - echo "Error: Environment variable MIT_SPIDER_TOKEN is not set." >&2 - exit 1 -fi - -if [[ -z "${MIT_SPIDER_URL}" ]]; then - echo "Error: Environment variable MIT_SPIDER_URL is not set." >&2 +if [[ -z "${MIT_SPIDER_TOKEN}" || -z "${MIT_SPIDER_URL}" || -z "${M6_TENANT}" ]]; then + echo "Error: One or more required environment variables are not set." + echo "Required variables:" + [[ -z "${MIT_SPIDER_TOKEN}" ]] && echo " - MIT_SPIDER_TOKEN" + [[ -z "${MIT_SPIDER_URL}" ]] && echo " - MIT_SPIDER_URL" + [[ -z "${M6_TENANT}" ]] && echo " - M6_TENANT" exit 1 fi @@ -21,7 +19,6 @@ MODEL_DIR=$(realpath $1) MODEL_ID=$(basename "$MODEL_DIR")_$(date +"%Y%m%d_%H%M%S") NUM_GPUS_TOTAL=1 JUDGMENT_PARALLEL=4 -export M6_TENANT=M6 function install_fastchat { if [[ ! -d "FastChat" ]]; then diff --git a/benchmarks/accuracy_benchmark/llama.sh b/benchmarks/accuracy_benchmark/llama.sh index 5de8224..f4f36e3 100755 --- a/benchmarks/accuracy_benchmark/llama.sh +++ b/benchmarks/accuracy_benchmark/llama.sh @@ -15,9 +15,8 @@ WORLD_SIZE="${WORLD_SIZE:-1}" MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" MASTER_PORT="${MASTER_PORT:-9010}" NPROC_PER_NODE="${NPROC_PER_NODE:-8}" -BS="${BS:-2}" +BATCH_SIZE="${BATCH_SIZE:-2}" SEQLEN="${SEQLEN:-1024}" -TASK_TAG="${TASK_TAG:-0000}" PRECISION="bf16=true" RUN_CLM=./run_clm.py @@ -33,10 +32,10 @@ elif [ "$USE_TORCHACC" -eq 1 ]; then export ACCELERATE_USE_FSDP=true export PJRT_USE_TORCH_ALLOCATOR=true export LOW_CPU_MEM_USAGE=1 - export XLA_PERSISTENT_CACHE_PATH=./compiled_cache # uncomment this line to cache the compile results and speed up initialization. 
+ export XLA_PERSISTENT_CACHE_PATH=./compiled_cache + FSDP_CONFIG="llama_fsdp_acc.json" TEMP_OUTPUT_DIR=$(basename "$MODEL_DIR")_acc OUTPUTS_DIR=${3:-$TEMP_OUTPUT_DIR} - FSDP_CONFIG="llama_fsdp_acc.json" else echo "The third argument must be 0 or 1" exit 1 @@ -53,8 +52,8 @@ torchrun --nproc_per_node "$NPROC_PER_NODE" \ --dataset_name wikitext \ --dataset_config_name wikitext-103-raw-v1 \ --use_fast_tokenizer false \ - --per_device_train_batch_size "$BS" \ - --per_device_eval_batch_size "$BS" \ + --per_device_train_batch_size "$BATCH_SIZE" \ + --per_device_eval_batch_size "$BATCH_SIZE" \ --do_train \ --output_dir "$OUTPUTS_DIR" \ --overwrite_output_dir \ diff --git a/benchmarks/accuracy_benchmark/llama_acc.sh b/benchmarks/accuracy_benchmark/llama_acc.sh deleted file mode 100644 index a413d07..0000000 --- a/benchmarks/accuracy_benchmark/llama_acc.sh +++ /dev/null @@ -1,94 +0,0 @@ -#!/bin/bash - -# $1: the HF transformers dir -# $2: local model directory -# $3: 0 or 1 to indicate using torchacc or not -if [ "$#" -ne 2 ]; then - echo "Usage: $0 <0 or 1 to indicate using torchacc or not>" - echo "You must provide exactly 2 parameters." - exit 1 -fi - -export PJRT_DEVICE=CUDA -export XLA_FLAGS='--xla_gpu_memory_limit_slop_factor=500 --xla_multiheap_size_constraint_per_heap=15032385536' -export ACCELERATE_USE_FSDP=true -export PJRT_USE_TORCH_ALLOCATOR=true -# export LOW_CPU_MEM_USAGE=1 -# export XLA_PERSISTENT_CACHE_PATH=./compiled_cache # uncomment this line to cache the compile results and speed up initialization. - -RANK="${RANK:-0}" -WORLD_SIZE="${WORLD_SIZE:-1}" -MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" -MASTER_PORT="${MASTER_PORT:-9010}" -NPROC_PER_NODE="${NPROC_PER_NODE:-8}" -BS="${BS:-2}" -SEQLEN="${SEQLEN:-1024}" -TASK_TAG="${TASK_TAG:-0000}" - -PRECISION="bf16=true" -JOB_NAME="LLAMA_FSDP_TORCHACC_GPU${NPROC_PER_NODE}_BS${BS}_SEQLEN${SEQLEN}_BF16" -FSDP_CONFIG="llama_fsdp_acc.json" -CLS_TO_WRAP="LlamaDecoderLayer" - -TRANSFORMERS_DIR=$(realpath "$1") -MODEL_DIR=$(realpath "$2") -OUTPUTS_DIR=$(basename "$MODEL_DIR")_acc -RUN_CLM=$TRANSFORMERS_DIR/examples/pytorch/language-modeling/run_clm.py - -# Patch the run_clm.py -PATCH_FILE=$(realpath ./run_clm.py.acc.patch) -git config --global --add safe.directory $TRANSFORMERS_DIR -pushd $TRANSFORMERS_DIR -git checkout . -patch -p1 < $PATCH_FILE -popd - -# This is the training config. You can change it as you need. -cat >"$FSDP_CONFIG" < "$FSDP_CONFIG" -{ - "train_batch_size": $((BS*8)), - "train_micro_batch_size_per_gpu": $BS, - "optimizer": { - "type": "AdamW" - }, - "zero_optimization": { - "stage": 3 - }, - "bf16": { - "enabled": true - } -} -EOF - - -echo "Running a deepspeed job ..." 
-export USE_TORCH_XLA=0 - -NPROC_PER_NODE=8 -PRECISION="bf16=true" -JOB_NAME="QWEN_FSDP_DEEPSPEED_GPU${NPROC_PER_NODE}_BS${BS}_SEQLEN${SEQLEN}_BF16" - - -torchrun --nproc_per_node $NPROC_PER_NODE \ - --nnodes $WORLD_SIZE \ - --node_rank $RANK \ - --master_port $MASTER_PORT \ - --master_addr $MASTER_ADDR \ - $RUN_CLM \ - --num_train_epochs 2 \ - --dataset_name wikitext \ - --dataset_config_name wikitext-103-raw-v1 \ - --use_fast_tokenizer false \ - --per_device_train_batch_size $BS \ - --per_device_eval_batch_size $BS \ - --do_train \ - --output_dir ./outputs_ds \ - --overwrite_output_dir \ - --model_name_or_path $MODEL_NAME_OR_PATH \ - --tokenizer_name $MODEL_NAME_OR_PATH \ - --trust_remote_code true \ - --cache_dir ./cache \ - --block_size $SEQLEN \ - --optim adamw_torch \ - --save_strategy no \ - --logging_strategy steps \ - --gradient_checkpointing no \ - --logging_steps 100 \ - --max_train_samples 100 \ - --$PRECISION \ - --deepspeed $FSDP_CONFIG 2>&1 | tee ./$JOB_NAME.log diff --git a/benchmarks/accuracy_benchmark/llama_fsdp_acc.json b/benchmarks/accuracy_benchmark/llama_fsdp_acc.json index 6b40864..23c4a56 100644 --- a/benchmarks/accuracy_benchmark/llama_fsdp_acc.json +++ b/benchmarks/accuracy_benchmark/llama_fsdp_acc.json @@ -10,5 +10,5 @@ "pin_layout_in_collective_ops": false, "flatten_parameters": true }, - "xla_fsdp_grad_ckpt": false + "xla_fsdp_grad_ckpt": true } diff --git a/benchmarks/accuracy_benchmark/llama_torch.sh b/benchmarks/accuracy_benchmark/llama_torch.sh deleted file mode 100644 index 9fb7b06..0000000 --- a/benchmarks/accuracy_benchmark/llama_torch.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -# $1: the HF transformers dir -# $2: local model directory -if [ "$#" -ne 2 ]; then - echo "Usage: $0 " - echo "You must provide exactly 2 parameters." - exit 1 -fi - -export USE_TORCH_XLA=0 - -RANK="${RANK:-0}" -WORLD_SIZE="${WORLD_SIZE:-1}" -MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" -MASTER_PORT="${MASTER_PORT:-9010}" -NPROC_PER_NODE="${NPROC_PER_NODE:-8}" -BS="${BS:-2}" -SEQLEN="${SEQLEN:-1024}" -TASK_TAG="${TASK_TAG:-0000}" - -PRECISION="bf16=true" -JOB_NAME="LLAMA_FSDP_TORCH_GPU${NPROC_PER_NODE}_BS${BS}_SEQLEN${SEQLEN}_BF16" -FSDP_CONFIG="llama_fsdp_torch.json" -CLS_TO_WRAP="LlamaDecoderLayer" - -TRANSFORMERS_DIR=$(realpath "$1") -MODEL_DIR=$(realpath "$2") -OUTPUTS_DIR=$(basename "$MODEL_DIR")_torch -RUN_CLM=$TRANSFORMERS_DIR/examples/pytorch/language-modeling/run_clm.py - -# Patch the run_clm.py -PATCH_FILE=$(realpath ./run_clm.py.torch.patch) -git config --global --add safe.directory $TRANSFORMERS_DIR -pushd $TRANSFORMERS_DIR -git checkout . -patch -p1 < $PATCH_FILE -popd - -# This is the training config. You can change it as you need. -cat >"$FSDP_CONFIG" < " - echo "You must provide exactly 2 parameters." - exit 1 -fi - -export PJRT_DEVICE=CUDA -export XLA_FLAGS='--xla_gpu_memory_limit_slop_factor=500 --xla_multiheap_size_constraint_per_heap=15032385536' -export ACCELERATE_USE_FSDP=true -export PJRT_USE_TORCH_ALLOCATOR=true -# export LOW_CPU_MEM_USAGE=1 -# export XLA_PERSISTENT_CACHE_PATH=./compiled_cache # uncomment this line to cache the compile results and speed up initialization. 
- -RANK="${RANK:-0}" -WORLD_SIZE="${WORLD_SIZE:-1}" -MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}" -MASTER_PORT="${MASTER_PORT:-9010}" -NPROC_PER_NODE="${NPROC_PER_NODE:-8}" -BS="${BS:-1}" -SEQLEN="${SEQLEN:-4096}" -TASK_TAG="${TASK_TAG:-0000}" - -PRECISION="bf16=true" -JOB_NAME="qwen_FSDP_TORCHACC_GPU${NPROC_PER_NODE}_BS${BS}_SEQLEN${SEQLEN}_BF16" -FSDP_CONFIG="qwen_fsdp_acc.json" -CLS_TO_WRAP="Qwen2DecoderLayer" - -TRANSFORMERS_DIR=$(realpath "$1") -MODEL_DIR=$(realpath "$2") -OUTPUTS_DIR=$(basename "$MODEL_DIR")_acc -RUN_CLM=$TRANSFORMERS_DIR/examples/pytorch/language-modeling/run_clm.py - -# Patch the run_clm.py -PATCH_FILE=$(realpath ./run_clm.py.acc.patch) -git config --global --add safe.directory $TRANSFORMERS_DIR -pushd $TRANSFORMERS_DIR -git checkout . -patch -p1 < $PATCH_FILE -popd - -# This is the training config. You can change it as you need. -cat >"$FSDP_CONFIG" <" + echo "You must provide exactly 1 parameters." + exit 1 +fi + +MODEL_DIR=$(realpath "$1") +MODEL_NAME=$(basename "$MODEL_DIR") +TIMESTAMP=$(date +"%Y%m%d_%H%M%S") +RES_FOLDER="./result/$TIMESTAMP" +MODEL_NAME_TORCH="$RES_FOLDER/torch_ckpt" +MODEL_NAME_ACC="$RES_FOLDER/acc_ckpt" +TORCH_TRAIN_LOG="$RES_FOLDER/torch_training.log" +ACC_TRAIN_LOG="$RES_FOLDER/acc_training.log" +ORIG_MODEL_EVAL_LOG="$RES_FOLDER/original_model_eval.log" +TORCH_MODEL_EVAL_LOG="$RES_FOLDER/torch_model_eval.log" +ACC_MODEL_EVAL_LOG="$RES_FOLDER/acc_model_eval.log" +RES_LOG_FILE="$RES_FOLDER/result.log" + +mkdir -p $RES_FOLDER + + +# Run the torch native job +bash ./llama.sh "$MODEL_DIR" 0 $MODEL_NAME_TORCH 2>&1 | tee $TORCH_TRAIN_LOG + +# Run the torchacc job +bash ./llama.sh "$MODEL_DIR" 1 $MODEL_NAME_ACC 2>&1 | tee $ACC_TRAIN_LOG + +# Evaluate original checkpoint +bash ./fastchat.sh "$MODEL_DIR" 2>&1 | tee $ORIG_MODEL_EVAL_LOG + +# Evaluate Torch job +bash ./fastchat.sh "$MODEL_NAME_TORCH" 2>&1 | tee $TORCH_MODEL_EVAL_LOG + +# Evaluate TorchAcc job +bash ./fastchat.sh "$MODEL_NAME_ACC" 2>&1 | tee $ACC_MODEL_EVAL_LOG + +# Collect and compare the results +ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}') +TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}') +ACC_SCORE=$(tail -1 $ACC_MODEL_EVAL_LOG | awk '{print $NF}') + +TORCH_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $TORCH_TRAIN_LOG) +TORCH_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $TORCH_TRAIN_LOG) +TORCH_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $TORCH_TRAIN_LOG) +ACC_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $ACC_TRAIN_LOG) +ACC_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $ACC_TRAIN_LOG) +ACC_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $ACC_TRAIN_LOG) + + +RESET='\033[0m' +RED='\033[31m' +GREEN='\033[32m' +YELLOW='\033[33m' +BLUE='\033[34m' +CYAN='\033[36m' + +{ + echo -e "\n${BLUE}==================== Training Results ====================${RESET}" + echo -e "${YELLOW}Torch train loss = ${GREEN}${TORCH_TRAIN_LOSS}${RESET}" + echo -e "${YELLOW}TorchAcc train loss = ${GREEN}${ACC_TRAIN_LOSS}${RESET}" + echo -e "${YELLOW}Torch train runtime (s) = ${GREEN}${TORCH_TRAIN_RUNTIME}${RESET}" + echo -e "${YELLOW}TorchAcc train runtime (s) = ${GREEN}${ACC_TRAIN_RUNTIME}${RESET}" + echo -e "${YELLOW}Torch train steps per second = ${GREEN}${TORCH_TRAIN_STEPS_PER_SECOND}${RESET}" + echo -e "${YELLOW}TorchAcc train steps per second = ${GREEN}${ACC_TRAIN_STEPS_PER_SECOND}${RESET}" + + echo -e "\n${BLUE}=================== Evaluation Results ===================${RESET}" + echo -e 
"${YELLOW}Original Model Score = ${GREEN}${ORIG_SCORE}${RESET}" + echo -e "${YELLOW}Torch Model Score = ${GREEN}${TORCH_SCORE}${RESET}" + echo -e "${YELLOW}TorchAcc Model Score = ${GREEN}${ACC_SCORE}${RESET}" + + echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}" + echo -e "${BLUE}==========================================================${RESET}" +} | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE) diff --git a/benchmarks/accuracy_benchmark/run_accuracy_bench.sh b/benchmarks/accuracy_benchmark/run_accuracy_bench.sh deleted file mode 100755 index b36cbd2..0000000 --- a/benchmarks/accuracy_benchmark/run_accuracy_bench.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash - -if [ "$#" -ne 1 ]; then - echo "Usage: $0 " - echo "You must provide exactly 1 parameters." - exit 1 -fi - -MODEL_DIR=$(realpath "$1") -MODEL_NAME=$(basename "$MODEL_DIR") -TIMESTAMP=$(date +"%Y%m%d_%H%M%S") -RES_FOLDER="./result/$TIMESTAMP" -MODEL_NAME_TORCH="$RES_FOLDER/torch_ckpt" -MODEL_NAME_ACC="$RES_FOLDER/acc_ckpt" -TORCH_TRAIN_LOG="$RES_FOLDER/torch_training.log" -ACC_TRAIN_LOG="$RES_FOLDER/acc_training.log" -ORIG_MODEL_EVAL_LOG="$RES_FOLDER/original_model_eval.log" -TORCH_MODEL_EVAL_LOG="$RES_FOLDER/torch_model_eval.log" -ACC_MODEL_EVAL_LOG="$RES_FOLDER/acc_model_eval.log" -RES_LOG_FILE="$RES_FOLDER/result.log" - -mkdir -p $RES_FOLDER - - -# Run the torch native job -bash ./llama.sh "$MODEL_DIR" 0 $MODEL_NAME_TORCH 2>&1 | tee $TORCH_TRAIN_LOG - -# Run the torchacc job -bash ./llama.sh "$MODEL_DIR" 1 $MODEL_NAME_ACC 2>&1 | tee $ACC_TRAIN_LOG - -# Evaluate original checkpoint -bash ./mtbench.sh "$MODEL_DIR" 2>&1 | tee $ORIG_MODEL_EVAL_LOG - -# Evaluate Torch job -bash ./mtbench.sh "$MODEL_NAME_TORCH" 2>&1 | tee $TORCH_MODEL_EVAL_LOG - -# Evaluate TorchAcc job -bash ./mtbench.sh "$MODEL_NAME_ACC" 2>&1 | tee $ACC_MODEL_EVAL_LOG - -# Collect and compare the results -ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}') -TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}') -ACC_SCORE=$(tail -1 $ACC_MODEL_EVAL_LOG | awk '{print $NF}') - -torch_train_loss=$(grep -oP 'train_loss\s*=\s*\K[0-9.]*' $TORCH_TRAIN_LOG) -torch_train_runtime=$(grep -oP 'train_runtime\s*=\s*\K[0-9:.]*' $TORCH_TRAIN_LOG) -torch_train_samples_per_second=$(grep -oP 'train_samples_per_second\s*=\s*\K[0-9.]*' $TORCH_TRAIN_LOG) -acc_train_loss=$(grep -oP 'train_loss\s*=\s*\K[0-9.]*' $ACC_TRAIN_LOG) -acc_train_runtime=$(grep -oP 'train_runtime\s*=\s*\K[0-9:.]*' $ACC_TRAIN_LOG) -acc_train_samples_per_second=$(grep -oP 'train_samples_per_second\s*=\s*\K[0-9.]*' $ACC_TRAIN_LOG) - - -RESET='\033[0m' -RED='\033[31m' -GREEN='\033[32m' -YELLOW='\033[33m' -BLUE='\033[34m' -CYAN='\033[36m' - -{ - echo -e "${BLUE}==================== Training Results ====================${RESET}" - echo -e "${YELLOW}Torch train loss = ${GREEN}${torch_train_loss}${RESET}" - echo -e "${YELLOW}TorchAcc train loss = ${GREEN}${acc_train_loss}${RESET}" - echo -e "${YELLOW}Torch train runtime = ${GREEN}${torch_train_runtime}${RESET}" - echo -e "${YELLOW}TorchAcc train runtime = ${GREEN}${acc_train_runtime}${RESET}" - echo -e "${YELLOW}Torch train samples per second = ${GREEN}${torch_train_samples_per_second}${RESET}" - echo -e "${YELLOW}TorchAcc train samples per second = ${GREEN}${acc_train_samples_per_second}${RESET}" - - - echo -e "${BLUE}==================== Evaluate Results ====================${RESET}" - echo -e "${YELLOW}Original Model Score = ${GREEN}${ORIG_SCORE}${RESET}" - echo -e "${YELLOW}Torch Model Score = ${GREEN}${TORCH_SCORE}${RESET}" 
- echo -e "${YELLOW}TorchAcc Model Score = ${GREEN}${ACC_SCORE}${RESET}" - - echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}" - echo -e "${BLUE}==========================================================${RESET}" -} | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE) diff --git a/benchmarks/accuracy_benchmark/run_clm.py.acc.patch b/benchmarks/accuracy_benchmark/run_clm.py.acc.patch deleted file mode 100644 index 444981b..0000000 --- a/benchmarks/accuracy_benchmark/run_clm.py.acc.patch +++ /dev/null @@ -1,24 +0,0 @@ -diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py -index d3f8ad8da..7e14b7d3c 100755 ---- a/examples/pytorch/language-modeling/run_clm.py -+++ b/examples/pytorch/language-modeling/run_clm.py -@@ -20,6 +20,8 @@ Here is the full list of checkpoints on the hub that can be fine-tuned by this s - https://huggingface.co/models?filter=text-generation - """ - # You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. -+import torchacc -+torchacc.utils.patch.patch_llama(1) - - import logging - import math -@@ -434,9 +436,10 @@ def main(): - trust_remote_code=model_args.trust_remote_code, - torch_dtype=torch_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, -+ attn_implementation='flash_attention_2' - ) - else: -- model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) -+ model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code, attn_implementation='flash_attention_2') - n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") diff --git a/benchmarks/accuracy_benchmark/run_clm.py.torch.patch b/benchmarks/accuracy_benchmark/run_clm.py.torch.patch deleted file mode 100644 index fd7de8c..0000000 --- a/benchmarks/accuracy_benchmark/run_clm.py.torch.patch +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py -index d3f8ad8da..7e14b7d3c 100755 ---- a/examples/pytorch/language-modeling/run_clm.py -+++ b/examples/pytorch/language-modeling/run_clm.py -@@ -434,9 +436,10 @@ def main(): - trust_remote_code=model_args.trust_remote_code, - torch_dtype=torch_dtype, - low_cpu_mem_usage=model_args.low_cpu_mem_usage, -+ attn_implementation='flash_attention_2' - ) - else: -- model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code) -+ model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code, attn_implementation='flash_attention_2') - n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")