Skip to content

Commit

Permalink
Add num checks
Browse files Browse the repository at this point in the history
  • Loading branch information
anw90 committed Dec 24, 2024
1 parent 0f5345e commit 7170d37
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 10 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/accuracy_benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ name: Daily Accuracy Benchmark
on:
workflow_dispatch:
schedule:
# Runs daily at 3:00 AM, Beijing time.
- cron: '0 19 * * *' # This is UTC time
# Runs daily at 2:00 AM, Beijing time.
- cron: '0 18 * * *' # This is UTC time

jobs:
accuracy_benchmark:
Expand All @@ -20,7 +20,7 @@ jobs:
docker run -v $PWD:$PWD -w $PWD --net host --ipc host --shm-size 80G -t --rm --gpus all $UT_IMAGE bash -c '
git config --global --add safe.directory $PWD && \
pip install -e . && \
cd benchmarks/accuracy && bash ./run.sh'
cd benchmarks/accuracy && NPROC_PER_NODE=4 bash ./run.sh'
env:
UT_IMAGE: ${{ secrets.UT_IMAGE }}
OSS_AK_ID: ${{ secrets.OSS_AK_ID }}
Expand Down
5 changes: 2 additions & 3 deletions benchmarks/accuracy/fastchat.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,15 @@ fi

MODEL_DIR=$(realpath $1)
MODEL_ID=$(basename "$MODEL_DIR")_$(date +"%Y%m%d_%H%M%S")
NUM_GPUS_TOTAL=1
NUM_GPUS_TOTAL=4
JUDGMENT_PARALLEL=4

function install_fastchat {
if [[ ! -d "FastChat_TorchAcc" ]]; then
git clone https://github.com/AlibabaPAI/FastChat_TorchAcc.git
fi

output=$(python -m pip list | grep fschat)
if [[ -n $output ]]; then
if python -m pip list | grep -q fschat; then
echo "All requirements are installed."
else
echo "Install requirements ..."
Expand Down
35 changes: 31 additions & 4 deletions benchmarks/accuracy/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ ORIG_MODEL_EVAL_LOG="$RES_FOLDER/original_model_eval.log"
TORCH_MODEL_EVAL_LOG="$RES_FOLDER/torch_model_eval.log"
ACC_MODEL_EVAL_LOG="$RES_FOLDER/acc_model_eval.log"
RES_LOG_FILE="$RES_FOLDER/result.log"
OSS_BUCKET_PATH="oss://pai-devel/benchmark/accuracy/$(date +'%Y-%m')/$TIMESTAMP"

mkdir -p $RES_FOLDER

Expand Down Expand Up @@ -57,14 +58,18 @@ function upload_to_oss {
curl https://gosspublic.alicdn.com/ossutil/install.sh | bash
fi
ossutil config -e ${OSS_ENDPOINT} -i ${OSS_AK_ID} -k ${OSS_AK_SECRET}
ossutil cp -r -f -j 10 --exclude "*.safetensors" --exclude="*.bin" $RES_FOLDER oss://pai-devel/benchmark/accuracy/"$TIMESTAMP"
ossutil cp -r -f -j 10 --exclude "*.safetensors" --exclude="*.bin" $RES_FOLDER $OSS_BUCKET_PATH
else
echo "No oss information found. Skip uploading to oss."
fi
}


function collect_and_show_results {
# Report whether the first argument is an unsigned decimal number.
# Arguments: $1 - string to check
# Returns:   0 when $1 is one or more digits, optionally followed by a dot
#            and one or more digits (e.g. "3", "0.25"); 1 otherwise.
#            Signs, exponents, a leading/trailing dot and the empty string
#            are all rejected.
is_numeric() {
  local decimal_re='^[0-9]+([.][0-9]+)?$'
  if [[ "$1" =~ $decimal_re ]]; then
    return 0
  fi
  return 1
}

function collect_and_upload_results {
# Collect and compare the results
ORIG_SCORE=$(tail -1 $ORIG_MODEL_EVAL_LOG | awk '{print $NF}')
TORCH_SCORE=$(tail -1 $TORCH_MODEL_EVAL_LOG | awk '{print $NF}')
Expand Down Expand Up @@ -102,9 +107,31 @@ function collect_and_show_results {
echo -e "\n${CYAN}More details can be found in = ${RESET}${RES_FOLDER}"
echo -e "${BLUE}==========================================================${RESET}"
} | tee >(sed 's/\x1b\[[0-9;]*m//g' > $RES_LOG_FILE)

upload_to_oss

# Check the results.
# Every collected metric must be a plain unsigned decimal before it is handed
# to bc below; anything else (empty string, "nan", a log line) means a
# training or evaluation step did not produce a usable value.
# NOTE: the `||` chain must end on the last is_numeric call. A trailing
# `|| \ ` makes bash try to run a command literally named " " and print a
# "command not found" error whenever all metrics are valid.
if ! is_numeric "$TORCH_TRAIN_LOSS" || \
   ! is_numeric "$TORCH_TRAIN_RUNTIME" || \
   ! is_numeric "$TORCH_TRAIN_STEPS_PER_SECOND" || \
   ! is_numeric "$ACC_TRAIN_LOSS" || \
   ! is_numeric "$ACC_TRAIN_RUNTIME" || \
   ! is_numeric "$ACC_TRAIN_STEPS_PER_SECOND" || \
   ! is_numeric "$ORIG_SCORE" || \
   ! is_numeric "$TORCH_SCORE" || \
   ! is_numeric "$ACC_SCORE"; then
  echo "Error: One or more variables are not numeric."
  exit 1
fi

# Absolute difference between the two training losses. bc prints negative
# results with a leading "-", which ${LOSS_DIFF#-} strips to get |diff|.
LOSS_DIFF=$(echo "$TORCH_TRAIN_LOSS - $ACC_TRAIN_LOSS" | bc -l)
LOSS_DIFF_ABS=$(echo "${LOSS_DIFF#-}" | bc -l)
# bc prints 1 when the comparison holds and 0 otherwise; (( )) turns that
# into the exit status used by `if`.
if (( $(echo "$LOSS_DIFF_ABS > 0.01" | bc -l) )); then
  echo "Error: The difference between ACC_TRAIN_LOSS and TORCH_TRAIN_LOSS exceeds 1e-2."
  exit 1
fi
}

do_train
do_evaluation
collect_and_show_results
upload_to_oss
collect_and_upload_results

0 comments on commit 7170d37

Please sign in to comment.