Fix evaluation in MLCube
davidjurado committed Apr 13, 2023
1 parent f29b1bb commit 558db03
Showing 7 changed files with 111 additions and 27 deletions.
1 change: 1 addition & 0 deletions language_model/tensorflow/bert/Dockerfile
@@ -15,6 +15,7 @@ RUN pip install --no-cache-dir -r /requirements.txt
 COPY . /workspace/bert
 
 RUN chmod +x /workspace/bert/cleanup_scripts/*.sh
+RUN chmod +x /workspace/bert/*.sh
 
 # Set working directory
 WORKDIR /workspace/bert
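The added chmod makes the top-level entrypoint scripts (run_and_time.sh, check_logs.sh) executable inside the image, mirroring what was already done for cleanup_scripts/. A quick hedged check, assuming an illustrative image tag (the tag is not from this diff; substitute the image value from mlcube.yaml):

# Hypothetical image tag; substitute whatever `docker.image` is set to in mlcube.yaml.
docker run --rm mlperf/bert:latest ls -l /workspace/bert/run_and_time.sh /workspace/bert/check_logs.sh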
2 changes: 1 addition & 1 deletion language_model/tensorflow/bert/check_logs.sh
@@ -20,7 +20,7 @@ while [ $# -gt 0 ]; do
   shift
 done
 
-for filename in $LOG_DIR/*.log; do
+for filename in $LOG_DIR/bert_*.log; do
   log_file=${filename##*/}
   python -m mlperf_logging.compliance_checker $filename --log_output $CHECKER_LOG_DIR/$log_file || true
 done
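Narrowing the glob from *.log to bert_*.log keeps the compliance checker pointed at the MLPerf result logs this commit now writes (bert_train_*.log, bert_eval_*.log) and away from the plain console logs (train_console.log, eval_console.log), which are not MLPerf-format logs. A minimal standalone sketch of the same check, assuming the mlperf_logging package is installed; the timestamped filename is hypothetical:

# Run the MLPerf logging compliance checker on a single log by hand.
# 'logs/bert_train_20230413_120000.log' is an illustrative filename.
python -m mlperf_logging.compliance_checker logs/bert_train_20230413_120000.log \
    --log_output checker_logs/bert_train_20230413_120000.log || true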
2 changes: 1 addition & 1 deletion language_model/tensorflow/bert/cleanup_scripts/create_pretraining_data.py
@@ -7,7 +7,7 @@
 import collections
 import random
 import tokenization
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 
 flags = tf.flags
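Importing tensorflow.compat.v1 as tf lets this TF1-style script (it relies on tf.flags and other v1 APIs) run unchanged on a TensorFlow 2 installation. A one-line smoke test, assuming TF2 is installed in the container:

# Confirm the v1 compatibility shim exposes the APIs the script uses.
python3 -c 'import tensorflow.compat.v1 as tf; print(tf.__version__, tf.flags)'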
60 changes: 44 additions & 16 deletions language_model/tensorflow/bert/ (data-preprocessing script; filename not shown in this view)
@@ -2,7 +2,9 @@
 
 : "${INPUT_PATH:=/workspace/data/dataset/processed_dataset/results4}"
 : "${VOCAB_PATH:=/workspace/data/vocab.txt}"
-: "${OUTPUT_PATH:=/workspace/output_data}"
+: "${OUTPUT_PATH:=/workspace/tf_data}"
+: "${OUTPUT_EVAL_PATH:=/workspace/output_eval_data}"
+: "${EVAL_TXT:=/workspace/data/dataset/processed_dataset/results4/eval.txt}"
 
 while [ "$1" != "" ]; do
   case $1 in
@@ -15,6 +17,12 @@ while [ "$1" != "" ]; do
     --output_path=*)
       OUTPUT_PATH="${1#*=}"
       ;;
+    --eval_txt=*)
+      EVAL_TXT="${1#*=}"
+      ;;
+    --output_eval_path=*)
+      OUTPUT_EVAL_PATH="${1#*=}"
+      ;;
   esac
   shift
 done
@@ -25,18 +33,38 @@ echo "OUTPUT_PATH:" $OUTPUT_PATH
 
 cd cleanup_scripts
 
-for FILE in $INPUT_PATH/part*; do
-  echo "file: " $FILE
-  NEW_FILE="$(basename -- $FILE)"
-  echo "*Processing: " $NEW_FILE
-  python3 create_pretraining_data.py \
-    --input_file=$FILE \
-    --vocab_file=$VOCAB_PATH \
-    --output_file=$OUTPUT_PATH/$NEW_FILE \
-    --do_lower_case=True \
-    --max_seq_length=512 \
-    --max_predictions_per_seq=76 \
-    --masked_lm_prob=0.15 \
-    --random_seed=12345 \
-    --dupe_factor=10
-done
+#for FILE in $INPUT_PATH/part*; do
+#  echo "file: " $FILE
+#  NEW_FILE="$(basename -- $FILE)"
+#  echo "*Processing: " $NEW_FILE
+#  python3 create_pretraining_data.py \
+#    --input_file=$FILE \
+#    --vocab_file=$VOCAB_PATH \
+#    --output_file=$OUTPUT_PATH/$NEW_FILE \
+#    --do_lower_case=True \
+#    --max_seq_length=512 \
+#    --max_predictions_per_seq=76 \
+#    --masked_lm_prob=0.15 \
+#    --random_seed=12345 \
+#    --dupe_factor=10
+#done
+
+TEMP_FILE=$OUTPUT_EVAL_PATH/eval_temp
+echo "TEMP_FILE:"
+echo $TEMP_FILE
+
+python3 create_pretraining_data.py \
+  --input_file=$EVAL_TXT \
+  --output_file=$TEMP_FILE \
+  --vocab_file=$VOCAB_PATH \
+  --do_lower_case=True \
+  --max_seq_length=512 \
+  --max_predictions_per_seq=76 \
+  --masked_lm_prob=0.15 \
+  --random_seed=12345 \
+  --dupe_factor=10
+
+python3 pick_eval_samples.py \
+  --input_tfrecord=$TEMP_FILE \
+  --output_tfrecord=$OUTPUT_EVAL_PATH/eval_10k \
+  --num_examples_to_pick=10000
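With the per-shard pretraining loop commented out, this script now only builds the evaluation set: create_pretraining_data.py converts eval.txt into a TFRecord file, and pick_eval_samples.py samples 10,000 examples from it into eval_10k. A hedged spot-check that the picked file really holds 10,000 records, using the script's own default output path:

# Count records in the picked eval set; expect 10000.
python3 - <<'EOF'
import tensorflow.compat.v1 as tf
path = "/workspace/output_eval_data/eval_10k"  # default from this script
print(sum(1 for _ in tf.python_io.tf_record_iterator(path)))
EOF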
2 changes: 1 addition & 1 deletion language_model/tensorflow/bert/cleanup_scripts/pick_eval_samples.py
@@ -5,7 +5,7 @@
 import time
 import logging
 import collections
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 
 parser = argparse.ArgumentParser(
     description="Eval sample picker for BERT.")
13 changes: 12 additions & 1 deletion language_model/tensorflow/bert/mlcube/mlcube.yaml
@@ -14,7 +14,7 @@ docker:
   # Docker file name within docker build context, default is `Dockerfile`.
   build_file: "Dockerfile"
   # GPU arguments
-  gpu_args: "--gpus=all --shm-size 16G"
+  gpu_args: "--gpus=all"
 
 tasks:
   download_data:
@@ -30,16 +30,27 @@ tasks:
         vocab_path:
           type: file
          default: data/vocab.txt
+        eval_txt:
+          type: file
+          default: data/dataset/processed_dataset/results4/eval.txt
       outputs:
         output_path: tf_data/
+        output_eval_path: tf_eval_data/
   train:
     entrypoint: ./run_and_time.sh -a
     parameters:
       inputs:
         tfdata_path: tf_data/
+        init_checkpoint:
+          type: file
+          default: data/tf2_ckpt/model.ckpt-28252
+        eval_file:
+          type: file
+          default: tf_eval_data/eval_10k
         config_path: data/bert_config.json
       outputs:
         log_dir: logs/
+        output_dir: final_output/
   check_logs:
     entrypoint: ./check_logs.sh -a
     parameters:
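The YAML above wires the new evaluation artifacts through MLCube: the data task now also emits tf_eval_data/, and train consumes init_checkpoint and eval_file and exposes output_dir for the checkpoints the eval phase reads back. A sketch of driving the tasks end to end with the MLCube CLI (standard flag spellings; the process_data task name is an assumption, since this hunk does not show it):

# Hedged sketch; verify task names and flags against your mlcube version.
mlcube run --mlcube=. --task=download_data --platform=docker
mlcube run --mlcube=. --task=process_data --platform=docker   # task name assumed
mlcube run --mlcube=. --task=train --platform=docker
mlcube run --mlcube=. --task=check_logs --platform=docker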
58 changes: 51 additions & 7 deletions language_model/tensorflow/bert/run_and_time.sh
@@ -10,8 +10,11 @@ echo "STARTING TIMING RUN AT $start_fmt"
 
 # Set variables
 : "${TFDATA_PATH:=./workspace/output_data}"
+: "${INIT_CHECKPOINT:=./workspace/data/tf2_ckpt}"
+: "${EVAL_FILE:=./workspace/tf_eval_data/eval_10k}"
 : "${CONFIG_PATH:=./workspace/data/bert_config.json}"
 : "${LOG_DIR:=./workspace/logs}"
+: "${OUTPUT_DIR:=./workspace/final_output}"
 
 # Handle MLCube parameters
 while [ $# -gt 0 ]; do
@@ -22,6 +25,18 @@ while [ $# -gt 0 ]; do
     --config_path=*)
       CONFIG_PATH="${1#*=}"
       ;;
+    --init_checkpoint=*)
+      INIT_CHECKPOINT="${1#*=}"
+      ;;
+    --log_dir=*)
+      LOG_DIR="${1#*=}"
+      ;;
+    --output_dir=*)
+      OUTPUT_DIR="${1#*=}"
+      ;;
+    --eval_file=*)
+      EVAL_FILE="${1#*=}"
+      ;;
     *) ;;
   esac
   shift
@@ -33,13 +48,13 @@ echo "running benchmark"
 TF_XLA_FLAGS='--tf_xla_auto_jit=2' \
 python run_pretraining.py \
   --bert_config_file=$CONFIG_PATH \
-  --output_dir=/tmp/output/ \
-  --input_file=$TFDATA_PATH \
+  --output_dir=$OUTPUT_DIR \
+  --input_file="$TFDATA_PATH/part*" \
+  --init_checkpoint=$INIT_CHECKPOINT \
   --nodo_eval \
   --do_train \
-  --eval_batch_size=8 \
+  --eval_batch_size=4 \
   --learning_rate=0.0001 \
-  --init_checkpoint=./checkpoint/model.ckpt-28252 \
   --iterations_per_loop=1000 \
   --max_predictions_per_seq=76 \
   --max_seq_length=512 \
@@ -48,13 +63,42 @@ TF_XLA_FLAGS='--tf_xla_auto_jit=2' \
   --optimizer=lamb \
   --save_checkpoints_steps=6250 \
   --start_warmup_step=0 \
-  --num_gpus=8 \
-  --train_batch_size=24
+  --num_gpus=1 \
+  --train_batch_size=12 |& tee "$LOG_DIR/train_console.log"
+
+# Copy log file to MLCube log folder
+if [ "$LOG_DIR" != "" ]; then
+  timestamp=$(date +%Y%m%d_%H%M%S)
+  cp bert.log "$LOG_DIR/bert_train_$timestamp.log"
+fi
+
+TF_XLA_FLAGS='--tf_xla_auto_jit=2' \
+python3 run_pretraining.py \
+  --bert_config_file=$CONFIG_PATH \
+  --output_dir=$OUTPUT_DIR \
+  --input_file=$EVAL_FILE \
+  --do_eval \
+  --nodo_train \
+  --eval_batch_size=8 \
+  --init_checkpoint=$OUTPUT_DIR/model.ckpt-107538 \
+  --iterations_per_loop=1000 \
+  --learning_rate=0.0001 \
+  --max_eval_steps=1250 \
+  --max_predictions_per_seq=76 \
+  --max_seq_length=512 \
+  --num_gpus=1 \
+  --num_train_steps=107538 \
+  --num_warmup_steps=1562 \
+  --optimizer=lamb \
+  --save_checkpoints_steps=1562 \
+  --start_warmup_step=0 \
+  --train_batch_size=24 \
+  --nouse_tpu |& tee "$LOG_DIR/eval_console.log"
 
 # Copy log file to MLCube log folder
 if [ "$LOG_DIR" != "" ]; then
   timestamp=$(date +%Y%m%d_%H%M%S)
   cp mlperf_compliance.log "$LOG_DIR/mlperf_compliance_$timestamp.log"
+  cp bert.log "$LOG_DIR/bert_eval_$timestamp.log"
 fi
 
 set +x
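run_and_time.sh is now a two-phase run: training with --nodo_eval writes checkpoints to $OUTPUT_DIR (every 6,250 steps, up to step 107538), then a second run_pretraining.py invocation restores $OUTPUT_DIR/model.ckpt-107538 with --do_eval to score the 10k eval set. Note that |& tee is a bashism, and the eval step hardcodes the final-step checkpoint; a hedged alternative that resolves whatever checkpoint training actually produced last:

# Resolve the newest checkpoint in $OUTPUT_DIR instead of assuming step 107538.
LATEST_CKPT=$(python3 -c "import tensorflow.compat.v1 as tf; print(tf.train.latest_checkpoint('$OUTPUT_DIR'))")
echo "Latest checkpoint: $LATEST_CKPT"  # pass this to --init_checkpoint in the eval phase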
