
Commit

add workflow
anw90 committed Dec 12, 2024
1 parent a9848e6 commit d02a982
Showing 8 changed files with 125 additions and 61 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/accuracy_benchmark.yml
@@ -0,0 +1,25 @@
name: Daily Accuracy Benchmark

on:
  workflow_dispatch:
  schedule:
    # Runs daily at 3:00 AM, Beijing time.
    - cron: '0 19 * * *' # This is UTC time

jobs:
  accuracy_benchmark:
    runs-on: self-hosted

    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Perform the accuracy benchmark
        run: cd benchmarks/accuracy_benchmark && bash ./run.sh
        env:
          OSS_AK_ID: ${{ secrets.OSS_AK_ID }}
          OSS_AK_SECRET: ${{ secrets.OSS_AK_SECRET }}
          OSS_ENDPOINT: ${{ secrets.OSS_ENDPOINT }}
          M6_TENANT: ${{ secrets.M6_TENANT }}
          MIT_SPIDER_TOKEN: ${{ secrets.MIT_SPIDER_TOKEN }}
          MIT_SPIDER_URL: ${{ secrets.MIT_SPIDER_URL }}
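Because the workflow also declares a workflow_dispatch trigger, it can be started manually in addition to the nightly cron. A minimal sketch using the GitHub CLI, assuming gh is installed and authenticated for this repository (the workflow file name is taken from the diff above):

# Dispatch a run on the default branch, then check its status
gh workflow run accuracy_benchmark.yml
gh run list --workflow=accuracy_benchmark.yml --limit 1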
4 changes: 2 additions & 2 deletions benchmarks/accuracy_benchmark/fastchat.sh
@@ -21,7 +21,7 @@ NUM_GPUS_TOTAL=1
JUDGMENT_PARALLEL=4

function install_fastchat {
if [[ ! -d "FastChat" ]]; then
if [[ ! -d "FastChat_TorchAcc" ]]; then
git clone https://github.com/AlibabaPAI/FastChat_TorchAcc.git
fi

@@ -30,7 +30,7 @@ function install_fastchat {
else
echo "Install requirements ..."
pushd ./FastChat_TorchAcc
pip install -e ".[model_worker,llm_judge]"
pip install --use-pep517 -e ".[model_worker,llm_judge]"
pip install gradio
popd
fi
6 changes: 4 additions & 2 deletions benchmarks/accuracy_benchmark/llama.sh
@@ -17,6 +17,8 @@ MASTER_PORT="${MASTER_PORT:-9010}"
NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
BATCH_SIZE="${BATCH_SIZE:-2}"
SEQLEN="${SEQLEN:-1024}"
DATASET_NAME="${DATASET_NAME:-wikitext}"
DATASET_CONFIG_NAME="${DATASET_CONFIG_NAME:-wikitext-103-raw-v1}"
PRECISION="bf16=true"
RUN_CLM=./run_clm.py

@@ -49,8 +51,8 @@ torchrun --nproc_per_node "$NPROC_PER_NODE" \
--master_addr "$MASTER_ADDR" \
"$RUN_CLM" \
--num_train_epochs 2 \
--dataset_name wikitext \
--dataset_config_name wikitext-103-raw-v1 \
--dataset_name "$DATASET_NAME" \
--dataset_config_name "$DATASET_CONFIG_NAME" \
--use_fast_tokenizer false \
--per_device_train_batch_size "$BATCH_SIZE" \
--per_device_eval_batch_size "$BATCH_SIZE" \
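The two new variables make the training dataset configurable without editing the script. A hypothetical invocation that overrides the defaults (wikitext-2-raw-v1 is the smaller config of the same Hugging Face dataset; the positional arguments follow the model-dir / torchacc-flag / output-name convention visible in run.sh below):

DATASET_NAME=wikitext DATASET_CONFIG_NAME=wikitext-2-raw-v1 \
    bash ./llama.sh /path/to/Llama-3.2-1B 1 llama_acc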
142 changes: 87 additions & 55 deletions benchmarks/accuracy_benchmark/run.sh
@@ -1,12 +1,19 @@
#!/bin/bash

if [ "$#" -ne 1 ]; then
if [ "$#" -eq 1 ]; then
MODEL_DIR=$(realpath "$1")
elif [ "$#" -eq 0 ]; then
MODEL_DIR="./Llama-3.2-1B"
if [[ ! -d "$MODEL_DIR" ]]; then
MS_CKPT_URL="https://www.modelscope.cn/models/LLM-Research/Llama-3.2-1B.git"
git clone $MS_CKPT_URL
fi
else
echo "Usage: $0 <local model dir>"
echo "You must provide exactly 1 parameters."
exit 1
fi

MODEL_DIR=$(realpath "$1")

MODEL_NAME=$(basename "$MODEL_DIR")
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
RES_FOLDER="./result/$TIMESTAMP"
@@ -22,55 +29,80 @@ RES_LOG_FILE="$RES_FOLDER/result.log"
mkdir -p $RES_FOLDER


# Run the torch native job
bash ./llama.sh "$MODEL_DIR" 0 $MODEL_NAME_TORCH 2>&1 | tee $TORCH_TRAIN_LOG

# Run the torchacc job
bash ./llama.sh "$MODEL_DIR" 1 $MODEL_NAME_ACC 2>&1 | tee $ACC_TRAIN_LOG

# Evaluate original checkpoint
bash ./fastchat.sh "$MODEL_DIR" 2>&1 | tee $ORIG_MODEL_EVAL_LOG

# Evaluate Torch job
bash ./fastchat.sh "$MODEL_NAME_TORCH" 2>&1 | tee $TORCH_MODEL_EVAL_LOG

# Evaluate TorchAcc job
bash ./fastchat.sh "$MODEL_NAME_ACC" 2>&1 | tee $ACC_MODEL_EVAL_LOG

# Collect and compare the results
ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}')
TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}')
ACC_SCORE=$(tail -1 $ACC_MODEL_EVAL_LOG | awk '{print $NF}')

TORCH_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $TORCH_TRAIN_LOG)
TORCH_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $TORCH_TRAIN_LOG)
TORCH_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $TORCH_TRAIN_LOG)
ACC_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $ACC_TRAIN_LOG)
ACC_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $ACC_TRAIN_LOG)
ACC_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $ACC_TRAIN_LOG)


RESET='\033[0m'
RED='\033[31m'
GREEN='\033[32m'
YELLOW='\033[33m'
BLUE='\033[34m'
CYAN='\033[36m'

{
echo -e "\n${BLUE}==================== Training Results ====================${RESET}"
echo -e "${YELLOW}Torch train loss = ${GREEN}${TORCH_TRAIN_LOSS}${RESET}"
echo -e "${YELLOW}TorchAcc train loss = ${GREEN}${ACC_TRAIN_LOSS}${RESET}"
echo -e "${YELLOW}Torch train runtime (s) = ${GREEN}${TORCH_TRAIN_RUNTIME}${RESET}"
echo -e "${YELLOW}TorchAcc train runtime (s) = ${GREEN}${ACC_TRAIN_RUNTIME}${RESET}"
echo -e "${YELLOW}Torch train steps per second = ${GREEN}${TORCH_TRAIN_STEPS_PER_SECOND}${RESET}"
echo -e "${YELLOW}TorchAcc train steps per second = ${GREEN}${ACC_TRAIN_STEPS_PER_SECOND}${RESET}"

echo -e "\n${BLUE}=================== Evaluation Results ===================${RESET}"
echo -e "${YELLOW}Original Model Score = ${GREEN}${ORIG_SCORE}${RESET}"
echo -e "${YELLOW}Torch Model Score = ${GREEN}${TORCH_SCORE}${RESET}"
echo -e "${YELLOW}TorchAcc Model Score = ${GREEN}${ACC_SCORE}${RESET}"

echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}"
echo -e "${BLUE}==========================================================${RESET}"
} | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE)
function do_train {
# Run the torch native job
bash ./llama.sh "$MODEL_DIR" 0 $MODEL_NAME_TORCH 2>&1 | tee $TORCH_TRAIN_LOG

# Run the torchacc job
bash ./llama.sh "$MODEL_DIR" 1 $MODEL_NAME_ACC 2>&1 | tee $ACC_TRAIN_LOG
}

function do_evaluation {
# Evaluate original checkpoint
bash ./fastchat.sh "$MODEL_DIR" 2>&1 | tee $ORIG_MODEL_EVAL_LOG

# Evaluate Torch job
bash ./fastchat.sh "$MODEL_NAME_TORCH" 2>&1 | tee $TORCH_MODEL_EVAL_LOG

# Evaluate TorchAcc job
bash ./fastchat.sh "$MODEL_NAME_ACC" 2>&1 | tee $ACC_MODEL_EVAL_LOG
}


function upload_to_oss {
if [ -n "${OSS_ENDPOINT+x}" ] && [ -n "${OSS_AK_ID+x}" ] && [ -n "${OSS_AK_ID+x}" ]; then
if ! command -v ossutil >/dev/null 2>&1; then
curl https://gosspublic.alicdn.com/ossutil/install.sh | bash
fi
ossutil config -e ${OSS_ENDPOINT} -i ${OSS_AK_ID} -k ${OSS_AK_SECRET}
ossutil cp -r -f -j 10 --exclude "*.safetensors" --exclude="*.bin" $RES_FOLDER oss://pai-devel/benchmark/accuracy/"$TIMESTAMP"
else
echo "No oss information found. Skip uploading to oss."
fi
}


function collect_and_show_results {
# Collect and compare the results
ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}')
TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}')
ACC_SCORE=$(tail -1 $ACC_MODEL_EVAL_LOG | awk '{print $NF}')

TORCH_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $TORCH_TRAIN_LOG)
TORCH_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $TORCH_TRAIN_LOG)
TORCH_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $TORCH_TRAIN_LOG)
ACC_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $ACC_TRAIN_LOG)
ACC_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $ACC_TRAIN_LOG)
ACC_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $ACC_TRAIN_LOG)


RESET='\033[0m'
RED='\033[31m'
GREEN='\033[32m'
YELLOW='\033[33m'
BLUE='\033[34m'
CYAN='\033[36m'

{
echo -e "\n${BLUE}==================== Training Results ====================${RESET}"
echo -e "${YELLOW}Torch train loss = ${GREEN}${TORCH_TRAIN_LOSS}${RESET}"
echo -e "${YELLOW}TorchAcc train loss = ${GREEN}${ACC_TRAIN_LOSS}${RESET}"
echo -e "${YELLOW}Torch train runtime (s) = ${GREEN}${TORCH_TRAIN_RUNTIME}${RESET}"
echo -e "${YELLOW}TorchAcc train runtime (s) = ${GREEN}${ACC_TRAIN_RUNTIME}${RESET}"
echo -e "${YELLOW}Torch train steps per second = ${GREEN}${TORCH_TRAIN_STEPS_PER_SECOND}${RESET}"
echo -e "${YELLOW}TorchAcc train steps per second = ${GREEN}${ACC_TRAIN_STEPS_PER_SECOND}${RESET}"

echo -e "\n${BLUE}=================== Evaluation Results ===================${RESET}"
echo -e "${YELLOW}Original Model Score = ${GREEN}${ORIG_SCORE}${RESET}"
echo -e "${YELLOW}Torch Model Score = ${GREEN}${TORCH_SCORE}${RESET}"
echo -e "${YELLOW}TorchAcc Model Score = ${GREEN}${ACC_SCORE}${RESET}"

echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}"
echo -e "${BLUE}==========================================================${RESET}"
} | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE)
}

do_train
do_evaluation
upload_to_oss
collect_and_show_results
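With the reworked argument handling, run.sh can be launched with or without a local checkpoint; the zero-argument path clones Llama-3.2-1B from ModelScope, which is presumably why git-lfs is added to Dockerfile.base below. A usage sketch with placeholder paths and credentials:

# Evaluate an existing local checkpoint
bash ./run.sh /path/to/Llama-3.2-1B

# No argument: the Llama-3.2-1B checkpoint is cloned from ModelScope first
bash ./run.sh

# Optional: export OSS credentials beforehand so upload_to_oss is not skipped
export OSS_ENDPOINT=<your-endpoint> OSS_AK_ID=<your-ak-id> OSS_AK_SECRET=<your-ak-secret>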
2 changes: 2 additions & 0 deletions benchmarks/accuracy_benchmark/run_clm.py
@@ -1,3 +1,5 @@
# yapf: disable

#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
2 changes: 2 additions & 0 deletions docker/Dockerfile.base
@@ -19,6 +19,7 @@ RUN env ${PROXY} apt-get update \
wget \
curl \
git \
git-lfs \
gcc-11 \
g++-11 \
libjpeg-dev \
@@ -30,6 +31,7 @@ ccache \
ccache \
python3.10 \
python3.10-dev \
python3.10-lib2to3 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 \
&& update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 \
&& update-alternatives --install /usr/bin/python python /usr/bin/python3.10 100 \
2 changes: 1 addition & 1 deletion docker/Dockerfile.release
@@ -10,6 +10,6 @@ ENV PROXY=${use_proxy:+'https_proxy=http://127.0.0.1:7890 http_proxy=http://127.
COPY ./whls/* ${work_dir}/
RUN cd ${work_dir} \
&& env ${PROXY} pip install *.whl \
&& env ${PROXY} pip install transformers==4.33.0 datasets pillow SentencePiece accelerate transformers_stream_generator tiktoken peft bitsandbytes scipy \
&& env ${PROXY} pip install transformers==4.47.0 evaluate datasets pillow SentencePiece accelerate transformers_stream_generator tiktoken peft bitsandbytes scipy \
&& env ${PROXY} pip install torchvision==0.18.0 --no-deps \
&& rm -rf ${work_dir}/* /root/.cache/pip
3 changes: 2 additions & 1 deletion torchacc/__init__.py
@@ -140,10 +140,11 @@ def _set_env():

original_init = torch.autocast.__init__


def patched_init(self, device_type: str, *args, **kwargs):
if device_type == 'xla':
device_type = 'cuda'
original_init(self, device_type, *args, **kwargs)

torch.autocast.__init__ = patched_init

torch.autocast.__init__ = patched_init
