From d02a9823c148f46343ab835a550772eba3394bc8 Mon Sep 17 00:00:00 2001
From: "wangang.wa"
Date: Thu, 12 Dec 2024 16:28:19 +0800
Subject: [PATCH] add workflow

Add a GitHub Actions workflow that runs
benchmarks/accuracy_benchmark/run.sh on a self-hosted runner, either on
manual dispatch or on a daily cron at 19:00 UTC.

Supporting changes:
* run.sh: accept zero arguments (clone Llama-3.2-1B from ModelScope when
  the checkpoint directory is missing), split the script into functions
  and optionally upload the result folder to OSS.
* llama.sh: make the dataset name/config overridable through the
  DATASET_NAME and DATASET_CONFIG_NAME environment variables.
* fastchat.sh: test for the FastChat_TorchAcc checkout and install it
  with --use-pep517.
* run_clm.py: disable yapf for the file.
* docker: install git-lfs and python3.10-lib2to3, bump transformers to
  4.47.0 and add evaluate.
* torchacc/__init__.py: blank-line formatting only.

---
 .github/workflows/accuracy_benchmark.yml  |  25 ++++
 benchmarks/accuracy_benchmark/fastchat.sh |   4 +-
 benchmarks/accuracy_benchmark/llama.sh    |   6 +-
 benchmarks/accuracy_benchmark/run.sh      | 142 +++++++++++++---------
 benchmarks/accuracy_benchmark/run_clm.py  |   2 +
 docker/Dockerfile.base                    |   2 +
 docker/Dockerfile.release                 |   2 +-
 torchacc/__init__.py                      |   3 +-
 8 files changed, 125 insertions(+), 61 deletions(-)
 create mode 100644 .github/workflows/accuracy_benchmark.yml

diff --git a/.github/workflows/accuracy_benchmark.yml b/.github/workflows/accuracy_benchmark.yml
new file mode 100644
index 0000000..16c2473
--- /dev/null
+++ b/.github/workflows/accuracy_benchmark.yml
@@ -0,0 +1,25 @@
+name: Daily Accuracy Benchmark
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Runs daily at 3:00 AM, Beijing time.
+    - cron: '0 19 * * *' # This is UTC time
+
+jobs:
+  accuracy_benchmark:
+    runs-on: self-hosted
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Perform the accuracy benchmark
+        run: cd benchmarks/accuracy_benchmark && bash ./run.sh
+        env:
+          OSS_AK_ID: ${{ secrets.OSS_AK_ID }}
+          OSS_AK_SECRET: ${{ secrets.OSS_AK_SECRET }}
+          OSS_ENDPOINT: ${{ secrets.OSS_ENDPOINT }}
+          M6_TENANT: ${{ secrets.M6_TENANT }}
+          MIT_SPIDER_TOKEN: ${{ secrets.MIT_SPIDER_TOKEN }}
+          MIT_SPIDER_URL: ${{ secrets.MIT_SPIDER_URL }}
diff --git a/benchmarks/accuracy_benchmark/fastchat.sh b/benchmarks/accuracy_benchmark/fastchat.sh
index 7ea9f52..3d1c953 100644
--- a/benchmarks/accuracy_benchmark/fastchat.sh
+++ b/benchmarks/accuracy_benchmark/fastchat.sh
@@ -21,7 +21,7 @@ NUM_GPUS_TOTAL=1
 JUDGMENT_PARALLEL=4
 
 function install_fastchat {
-    if [[ ! -d "FastChat" ]]; then
+    if [[ ! -d "FastChat_TorchAcc" ]]; then
         git clone https://github.com/AlibabaPAI/FastChat_TorchAcc.git
     fi
 
@@ -30,7 +30,7 @@ function install_fastchat {
     else
         echo "Install requirements ..."
         pushd ./FastChat_TorchAcc
-        pip install -e ".[model_worker,llm_judge]"
+        pip install --use-pep517 -e ".[model_worker,llm_judge]"
         pip install gradio
         popd
     fi
diff --git a/benchmarks/accuracy_benchmark/llama.sh b/benchmarks/accuracy_benchmark/llama.sh
index f4f36e3..2ef189e 100755
--- a/benchmarks/accuracy_benchmark/llama.sh
+++ b/benchmarks/accuracy_benchmark/llama.sh
@@ -17,6 +17,8 @@ MASTER_PORT="${MASTER_PORT:-9010}"
 NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
 BATCH_SIZE="${BATCH_SIZE:-2}"
 SEQLEN="${SEQLEN:-1024}"
+DATASET_NAME="${DATASET_NAME:-wikitext}"
+DATASET_CONFIG_NAME="${DATASET_CONFIG_NAME:-wikitext-103-raw-v1}"
 PRECISION="bf16=true"
 RUN_CLM=./run_clm.py
 
@@ -49,8 +51,8 @@ torchrun --nproc_per_node "$NPROC_PER_NODE" \
     --master_addr "$MASTER_ADDR" \
     "$RUN_CLM" \
     --num_train_epochs 2 \
-    --dataset_name wikitext \
-    --dataset_config_name wikitext-103-raw-v1 \
+    --dataset_name "$DATASET_NAME" \
+    --dataset_config_name "$DATASET_CONFIG_NAME" \
     --use_fast_tokenizer false \
     --per_device_train_batch_size "$BATCH_SIZE" \
     --per_device_eval_batch_size "$BATCH_SIZE" \
diff --git a/benchmarks/accuracy_benchmark/run.sh b/benchmarks/accuracy_benchmark/run.sh
index f8d449f..1e32d2b 100755
--- a/benchmarks/accuracy_benchmark/run.sh
+++ b/benchmarks/accuracy_benchmark/run.sh
@@ -1,12 +1,19 @@
 #!/bin/bash
 
-if [ "$#" -ne 1 ]; then
+if [ "$#" -eq 1 ]; then
+    MODEL_DIR=$(realpath "$1")
+elif [ "$#" -eq 0 ]; then
+    MODEL_DIR=$(realpath "./Llama-3.2-1B")
+    if [[ ! -d "$MODEL_DIR" ]]; then
+        MS_CKPT_URL="https://www.modelscope.cn/models/LLM-Research/Llama-3.2-1B.git"
+        git clone "$MS_CKPT_URL" "$MODEL_DIR" || exit 1
+    fi
+else
     echo "Usage: $0 "
-    echo "You must provide exactly 1 parameters."
     exit 1
 fi
 
-MODEL_DIR=$(realpath "$1")
+
 MODEL_NAME=$(basename "$MODEL_DIR")
 TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
 RES_FOLDER="./result/$TIMESTAMP"
@@ -22,55 +29,80 @@ RES_LOG_FILE="$RES_FOLDER/result.log"
 
 mkdir -p $RES_FOLDER
 
-# Run the torch native job
-bash ./llama.sh "$MODEL_DIR" 0 $MODEL_NAME_TORCH 2>&1 | tee $TORCH_TRAIN_LOG
-
-# Run the torchacc job
-bash ./llama.sh "$MODEL_DIR" 1 $MODEL_NAME_ACC 2>&1 | tee $ACC_TRAIN_LOG
-
-# Evaluate original checkpoint
-bash ./fastchat.sh "$MODEL_DIR" 2>&1 | tee $ORIG_MODEL_EVAL_LOG
-
-# Evaluate Torch job
-bash ./fastchat.sh "$MODEL_NAME_TORCH" 2>&1 | tee $TORCH_MODEL_EVAL_LOG
-
-# Evaluate TorchAcc job
-bash ./fastchat.sh "$MODEL_NAME_ACC" 2>&1 | tee $ACC_MODEL_EVAL_LOG
-
-# Collect and compare the results
-ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}')
-TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}')
-ACC_SCORE=$(tail -1 $ACC_MODEL_EVAL_LOG | awk '{print $NF}')
-
-TORCH_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $TORCH_TRAIN_LOG)
-TORCH_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $TORCH_TRAIN_LOG)
-TORCH_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $TORCH_TRAIN_LOG)
-ACC_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $ACC_TRAIN_LOG)
-ACC_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $ACC_TRAIN_LOG)
-ACC_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $ACC_TRAIN_LOG)
-
-
-RESET='\033[0m'
-RED='\033[31m'
-GREEN='\033[32m'
-YELLOW='\033[33m'
-BLUE='\033[34m'
-CYAN='\033[36m'
-
-{
-    echo -e "\n${BLUE}==================== Training Results ====================${RESET}"
-    echo -e "${YELLOW}Torch train loss = ${GREEN}${TORCH_TRAIN_LOSS}${RESET}"
-    echo -e "${YELLOW}TorchAcc train loss = ${GREEN}${ACC_TRAIN_LOSS}${RESET}"
-    echo -e "${YELLOW}Torch train runtime (s) = ${GREEN}${TORCH_TRAIN_RUNTIME}${RESET}"
-    echo -e "${YELLOW}TorchAcc train runtime (s) = ${GREEN}${ACC_TRAIN_RUNTIME}${RESET}"
-    echo -e "${YELLOW}Torch train steps per second = ${GREEN}${TORCH_TRAIN_STEPS_PER_SECOND}${RESET}"
-    echo -e "${YELLOW}TorchAcc train steps per second = ${GREEN}${ACC_TRAIN_STEPS_PER_SECOND}${RESET}"
-
-    echo -e "\n${BLUE}=================== Evaluation Results ===================${RESET}"
-    echo -e "${YELLOW}Original Model Score = ${GREEN}${ORIG_SCORE}${RESET}"
-    echo -e "${YELLOW}Torch Model Score = ${GREEN}${TORCH_SCORE}${RESET}"
-    echo -e "${YELLOW}TorchAcc Model Score = ${GREEN}${ACC_SCORE}${RESET}"
-
-    echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}"
-    echo -e "${BLUE}==========================================================${RESET}"
-} | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE)
+function do_train {
+    # Run the torch native job
+    bash ./llama.sh "$MODEL_DIR" 0 $MODEL_NAME_TORCH 2>&1 | tee $TORCH_TRAIN_LOG
+
+    # Run the torchacc job
+    bash ./llama.sh "$MODEL_DIR" 1 $MODEL_NAME_ACC 2>&1 | tee $ACC_TRAIN_LOG
+}
+
+function do_evaluation {
+    # Evaluate original checkpoint
+    bash ./fastchat.sh "$MODEL_DIR" 2>&1 | tee $ORIG_MODEL_EVAL_LOG
+
+    # Evaluate Torch job
+    bash ./fastchat.sh "$MODEL_NAME_TORCH" 2>&1 | tee $TORCH_MODEL_EVAL_LOG
+
+    # Evaluate TorchAcc job
+    bash ./fastchat.sh "$MODEL_NAME_ACC" 2>&1 | tee $ACC_MODEL_EVAL_LOG
+}
+
+# Upload $RES_FOLDER to OSS (weights excluded); skipped unless all OSS_* variables are non-empty.
+function upload_to_oss {
+    if [ -n "${OSS_ENDPOINT:-}" ] && [ -n "${OSS_AK_ID:-}" ] && [ -n "${OSS_AK_SECRET:-}" ]; then
+        if ! command -v ossutil >/dev/null 2>&1; then
+            curl https://gosspublic.alicdn.com/ossutil/install.sh | bash
+        fi
+        ossutil config -e "${OSS_ENDPOINT}" -i "${OSS_AK_ID}" -k "${OSS_AK_SECRET}"
+        ossutil cp -r -f -j 10 --exclude "*.safetensors" --exclude="*.bin" "$RES_FOLDER" oss://pai-devel/benchmark/accuracy/"$TIMESTAMP"
+    else
+        echo "No oss information found. Skip uploading to oss."
+    fi
+}
+
+# Parse scores and training metrics from the logs, print a summary and save it to $RES_LOG_FILE.
+function collect_and_show_results {
+    # Collect and compare the results
+    ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}')
+    TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}')
+    ACC_SCORE=$(tail -1 $ACC_MODEL_EVAL_LOG | awk '{print $NF}')
+
+    TORCH_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $TORCH_TRAIN_LOG)
+    TORCH_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $TORCH_TRAIN_LOG)
+    TORCH_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $TORCH_TRAIN_LOG)
+    ACC_TRAIN_LOSS=$(grep -oP "'train_loss': \K[0-9.]*" $ACC_TRAIN_LOG)
+    ACC_TRAIN_RUNTIME=$(grep -oP "'train_runtime': \K[0-9.]*" $ACC_TRAIN_LOG)
+    ACC_TRAIN_STEPS_PER_SECOND=$(grep -oP "'train_steps_per_second': \K[0-9.]*" $ACC_TRAIN_LOG)
+
+
+    RESET='\033[0m'
+    RED='\033[31m'
+    GREEN='\033[32m'
+    YELLOW='\033[33m'
+    BLUE='\033[34m'
+    CYAN='\033[36m'
+
+    {
+        echo -e "\n${BLUE}==================== Training Results ====================${RESET}"
+        echo -e "${YELLOW}Torch train loss = ${GREEN}${TORCH_TRAIN_LOSS}${RESET}"
+        echo -e "${YELLOW}TorchAcc train loss = ${GREEN}${ACC_TRAIN_LOSS}${RESET}"
+        echo -e "${YELLOW}Torch train runtime (s) = ${GREEN}${TORCH_TRAIN_RUNTIME}${RESET}"
+        echo -e "${YELLOW}TorchAcc train runtime (s) = ${GREEN}${ACC_TRAIN_RUNTIME}${RESET}"
+        echo -e "${YELLOW}Torch train steps per second = ${GREEN}${TORCH_TRAIN_STEPS_PER_SECOND}${RESET}"
+        echo -e "${YELLOW}TorchAcc train steps per second = ${GREEN}${ACC_TRAIN_STEPS_PER_SECOND}${RESET}"
+
+        echo -e "\n${BLUE}=================== Evaluation Results ===================${RESET}"
+        echo -e "${YELLOW}Original Model Score = ${GREEN}${ORIG_SCORE}${RESET}"
+        echo -e "${YELLOW}Torch Model Score = ${GREEN}${TORCH_SCORE}${RESET}"
+        echo -e "${YELLOW}TorchAcc Model Score = ${GREEN}${ACC_SCORE}${RESET}"
+
+        echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}"
+        echo -e "${BLUE}==========================================================${RESET}"
+    } | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE)
+}
+
+do_train
+do_evaluation
+collect_and_show_results  # writes result.log, so it must run before the upload
+upload_to_oss
diff --git a/benchmarks/accuracy_benchmark/run_clm.py b/benchmarks/accuracy_benchmark/run_clm.py
index ac981d6..8b2ca86 100755
--- a/benchmarks/accuracy_benchmark/run_clm.py
+++ b/benchmarks/accuracy_benchmark/run_clm.py
@@ -1,3 +1,5 @@
 #!/usr/bin/env python
 # coding=utf-8
+# yapf: disable
+
 # Copyright 2020 The HuggingFace Inc. team. All rights reserved.
diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base
index 607c6d2..fe80ef7 100644
--- a/docker/Dockerfile.base
+++ b/docker/Dockerfile.base
@@ -19,6 +19,7 @@ RUN env ${PROXY} apt-get update \
     wget \
     curl \
     git \
+    git-lfs \
     gcc-11 \
     g++-11 \
     libjpeg-dev \
@@ -30,6 +31,7 @@ RUN env ${PROXY} apt-get update \
     ccache \
     python3.10 \
     python3.10-dev \
+    python3.10-lib2to3 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 \
     && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 \
     && update-alternatives --install /usr/bin/python python /usr/bin/python3.10 100 \
diff --git a/docker/Dockerfile.release b/docker/Dockerfile.release
index 3b93d26..b6eb272 100644
--- a/docker/Dockerfile.release
+++ b/docker/Dockerfile.release
@@ -10,6 +10,6 @@ ENV PROXY=${use_proxy:+'https_proxy=http://127.0.0.1:7890 http_proxy=http://127.
 COPY ./whls/* ${work_dir}/
 RUN cd ${work_dir} \
     && env ${PROXY} pip install *.whl \
-    && env ${PROXY} pip install transformers==4.33.0 datasets pillow SentencePiece accelerate transformers_stream_generator tiktoken peft bitsandbytes scipy \
+    && env ${PROXY} pip install transformers==4.47.0 evaluate datasets pillow SentencePiece accelerate transformers_stream_generator tiktoken peft bitsandbytes scipy \
     && env ${PROXY} pip install torchvision==0.18.0 --no-deps \
     && rm -rf ${work_dir}/* /root/.cache/pip
diff --git a/torchacc/__init__.py b/torchacc/__init__.py
index 704f5d5..ae5cf3d 100644
--- a/torchacc/__init__.py
+++ b/torchacc/__init__.py
@@ -140,10 +140,11 @@ def _set_env():
 
 
 original_init = torch.autocast.__init__
+
 
 def patched_init(self, device_type: str, *args, **kwargs):
     if device_type == 'xla':
         device_type = 'cuda'
     original_init(self, device_type, *args, **kwargs)
 
-torch.autocast.__init__ = patched_init
+torch.autocast.__init__ = patched_init