Commit 3958143
Fix evaluation in MLCube
davidjurado committed Apr 13, 2023
1 parent f29b1bb · commit 3958143
Showing 7 changed files with 111 additions and 27 deletions.
language_model/tensorflow/bert/Dockerfile (1 change: 1 addition, 0 deletions)

@@ -15,6 +15,7 @@ RUN pip install --no-cache-dir -r /requirements.txt
 COPY . /workspace/bert
 
 RUN chmod +x /workspace/bert/cleanup_scripts/*.sh
+RUN chmod +x /workspace/bert/*.sh
 
 # Set working directory
 WORKDIR /workspace/bert
language_model/tensorflow/bert/check_logs.sh (2 changes: 1 addition, 1 deletion)

@@ -20,7 +20,7 @@ while [ $# -gt 0 ]; do
     shift
 done
 
-for filename in $LOG_DIR/*.log; do
+for filename in $LOG_DIR/mlperf_compliance*.log; do
     log_file=${filename##*/}
     python -m mlperf_logging.compliance_checker $filename --log_output $CHECKER_LOG_DIR/$log_file || true
 done
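
Note: the narrowed glob matters because this commit also starts writing console logs (train_console.log, eval_console.log, bert_*.log) into the same LOG_DIR, and those are not MLPerf compliance logs. A minimal Python sketch of the same selection logic, assuming only that the mlperf_logging package is installed (the directory paths are hypothetical):

    # Sketch: check only compliance logs, skipping the console logs
    # that run_and_time.sh now writes into the same directory.
    import glob
    import os
    import subprocess

    log_dir = "workspace/logs"             # hypothetical path
    checker_log_dir = "workspace/checker"  # hypothetical path

    for path in glob.glob(os.path.join(log_dir, "mlperf_compliance*.log")):
        out = os.path.join(checker_log_dir, os.path.basename(path))
        # Mirrors `python -m mlperf_logging.compliance_checker $filename --log_output ...`;
        # check=False mirrors the shell loop's `|| true` so one bad log does not abort.
        subprocess.run(
            ["python", "-m", "mlperf_logging.compliance_checker", path,
             "--log_output", out],
            check=False,
        )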
language_model/tensorflow/bert/cleanup_scripts/create_pretraining_data.py

@@ -7,7 +7,7 @@
 import collections
 import random
 import tokenization
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 
 flags = tf.flags
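
Note: importing tensorflow.compat.v1 as tf lets this TF1-style script (tf.flags, graph/session APIs) keep working on a TensorFlow 2 runtime. A minimal sketch of the pattern, assuming TF2 is installed; whether these scripts also call disable_v2_behavior() is not shown in this diff:

    # Sketch: run TF1-style code on a TensorFlow 2 installation.
    import tensorflow.compat.v1 as tf

    tf.disable_v2_behavior()  # commonly paired with the compat import; restores graph mode

    flags = tf.flags          # tf.flags exists on compat.v1 but not on TF2's top-level tf
    flags.DEFINE_string("input_file", None, "Input raw text file(s).")
    FLAGS = flags.FLAGS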
language_model/tensorflow/bert (data-preprocessing shell script; file name not shown in this view)

@@ -2,7 +2,9 @@
 
 : "${INPUT_PATH:=/workspace/data/dataset/processed_dataset/results4}"
 : "${VOCAB_PATH:=/workspace/data/vocab.txt}"
-: "${OUTPUT_PATH:=/workspace/output_data}"
+: "${OUTPUT_PATH:=/workspace/tf_data}"
+: "${OUTPUT_EVAL_PATH:=/workspace/output_eval_data}"
+: "${EVAL_TXT:=/workspace/data/dataset/processed_dataset/results4/eval.txt}"
 
 while [ "$1" != "" ]; do
     case $1 in
@@ -15,6 +17,12 @@ while [ "$1" != "" ]; do
         --output_path=*)
             OUTPUT_PATH="${1#*=}"
             ;;
+        --eval_txt=*)
+            EVAL_TXT="${1#*=}"
+            ;;
+        --output_eval_path=*)
+            OUTPUT_EVAL_PATH="${1#*=}"
+            ;;
     esac
     shift
 done
@@ -25,18 +33,38 @@ echo "OUTPUT_PATH:" $OUTPUT_PATH
 
 cd cleanup_scripts
 
-for FILE in $INPUT_PATH/part*; do
-    echo "file: " $FILE
-    NEW_FILE="$(basename -- $FILE)"
-    echo "*Processing: " $NEW_FILE
-    python3 create_pretraining_data.py \
-        --input_file=$FILE \
-        --vocab_file=$VOCAB_PATH \
-        --output_file=$OUTPUT_PATH/$NEW_FILE \
-        --do_lower_case=True \
-        --max_seq_length=512 \
-        --max_predictions_per_seq=76 \
-        --masked_lm_prob=0.15 \
-        --random_seed=12345 \
-        --dupe_factor=10
-done
+#for FILE in $INPUT_PATH/part*; do
+#    echo "file: " $FILE
+#    NEW_FILE="$(basename -- $FILE)"
+#    echo "*Processing: " $NEW_FILE
+#    python3 create_pretraining_data.py \
+#        --input_file=$FILE \
+#        --vocab_file=$VOCAB_PATH \
+#        --output_file=$OUTPUT_PATH/$NEW_FILE \
+#        --do_lower_case=True \
+#        --max_seq_length=512 \
+#        --max_predictions_per_seq=76 \
+#        --masked_lm_prob=0.15 \
+#        --random_seed=12345 \
+#        --dupe_factor=10
+#done
+
+TEMP_FILE=$OUTPUT_EVAL_PATH/eval_temp
+echo "HERE"
+echo $TEMP_FILE
+
+python3 create_pretraining_data.py \
+    --input_file=$EVAL_TXT \
+    --output_file=$TEMP_FILE \
+    --vocab_file=$VOCAB_PATH \
+    --do_lower_case=True \
+    --max_seq_length=512 \
+    --max_predictions_per_seq=76 \
+    --masked_lm_prob=0.15 \
+    --random_seed=12345 \
+    --dupe_factor=10
+
+python3 pick_eval_samples.py \
+    --input_tfrecord=$TEMP_FILE \
+    --output_tfrecord=$OUTPUT_EVAL_PATH/eval_10k \
+    --num_examples_to_pick=10000
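
Note: pick_eval_samples.py is not shown in full in this commit. As a rough illustration of the step above, a hypothetical, simplified stand-in that copies the first N records of the temporary eval TFRecord into eval_10k (the real script may sample differently):

    # Hypothetical, simplified stand-in for pick_eval_samples.py:
    # copy the first `num_to_pick` records from one TFRecord file into another.
    import tensorflow.compat.v1 as tf

    def pick_eval_samples(input_tfrecord, output_tfrecord, num_to_pick=10000):
        picked = 0
        with tf.python_io.TFRecordWriter(output_tfrecord) as writer:
            for record in tf.python_io.tf_record_iterator(input_tfrecord):
                if picked >= num_to_pick:
                    break
                writer.write(record)  # records pass through unmodified
                picked += 1
        return picked

    # Roughly equivalent to the invocation above:
    # pick_eval_samples("/workspace/output_eval_data/eval_temp",
    #                   "/workspace/output_eval_data/eval_10k")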
language_model/tensorflow/bert/cleanup_scripts/pick_eval_samples.py

@@ -5,7 +5,7 @@
 import time
 import logging
 import collections
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
 
 parser = argparse.ArgumentParser(
     description="Eval sample picker for BERT.")
language_model/tensorflow/bert/mlcube/mlcube.yaml (13 changes: 12 additions, 1 deletion)

@@ -14,7 +14,7 @@ docker:
   # Docker file name within docker build context, default is `Dockerfile`.
   build_file: "Dockerfile"
   # GPU arguments
-  gpu_args: "--gpus=all --shm-size 16G"
+  gpu_args: "--gpus=all"
 
 tasks:
   download_data:
@@ -30,16 +30,27 @@ tasks:
         vocab_path:
           type: file
           default: data/vocab.txt
+        eval_txt:
+          type: file
+          default: data/dataset/processed_dataset/results4/eval.txt
       outputs:
         output_path: tf_data/
+        output_eval_path: tf_eval_data/
   train:
     entrypoint: ./run_and_time.sh -a
     parameters:
       inputs:
         tfdata_path: tf_data/
+        init_checkpoint:
+          type: file
+          default: data/tf2_ckpt/model.ckpt-28252
+        eval_file:
+          type: file
+          default: tf_eval_data/eval_10k
         config_path: data/bert_config.json
       outputs:
         log_dir: logs/
+        output_dir: final_output/
   check_logs:
     entrypoint: ./check_logs.sh -a
     parameters:
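
Note: with eval_txt, output_eval_path, init_checkpoint, eval_file, and output_dir wired through mlcube.yaml, the tasks can be driven from the host. A hedged sketch, assuming the standard mlcube CLI with the Docker runner; the preprocessing task's name is cut off in this view, so only the visible tasks are listed:

    # Sketch: run the visible MLCube tasks in order from the mlcube/ directory.
    import subprocess

    for task in ["download_data", "train", "check_logs"]:
        subprocess.run(
            ["mlcube", "run", "--mlcube=.", "--platform=docker", f"--task={task}"],
            check=True,  # stop if a task fails
        )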
language_model/tensorflow/bert/run_and_time.sh (58 changes: 51 additions, 7 deletions)

@@ -10,8 +10,11 @@ echo "STARTING TIMING RUN AT $start_fmt"
 
 # Set variables
 : "${TFDATA_PATH:=./workspace/output_data}"
+: "${INIT_CHECKPOINT:=./workspace/data/tf2_ckpt}"
+: "${EVAL_FILE:=./workspace/tf_eval_data/eval_10k}"
 : "${CONFIG_PATH:=./workspace/data/bert_config.json}"
 : "${LOG_DIR:=./workspace/logs}"
+: "${OUTPUT_DIR:=./workspace/final_output}"
 
 # Handle MLCube parameters
 while [ $# -gt 0 ]; do
@@ -22,6 +25,18 @@ while [ $# -gt 0 ]; do
         --config_path=*)
             CONFIG_PATH="${1#*=}"
             ;;
+        --init_checkpoint=*)
+            INIT_CHECKPOINT="${1#*=}"
+            ;;
+        --log_dir=*)
+            LOG_DIR="${1#*=}"
+            ;;
+        --output_dir=*)
+            OUTPUT_DIR="${1#*=}"
+            ;;
+        --eval_file=*)
+            EVAL_FILE="${1#*=}"
+            ;;
         *) ;;
     esac
     shift
@@ -33,13 +48,13 @@ echo "running benchmark"
 TF_XLA_FLAGS='--tf_xla_auto_jit=2' \
 python run_pretraining.py \
     --bert_config_file=$CONFIG_PATH \
-    --output_dir=/tmp/output/ \
-    --input_file=$TFDATA_PATH \
+    --output_dir=$OUTPUT_DIR \
+    --input_file="$TFDATA_PATH/part*" \
+    --init_checkpoint=$INIT_CHECKPOINT \
     --nodo_eval \
     --do_train \
-    --eval_batch_size=8 \
+    --eval_batch_size=4 \
     --learning_rate=0.0001 \
-    --init_checkpoint=./checkpoint/model.ckpt-28252 \
     --iterations_per_loop=1000 \
     --max_predictions_per_seq=76 \
     --max_seq_length=512 \
@@ -48,13 +63,42 @@ TF_XLA_FLAGS='--tf_xla_auto_jit=2' \
     --optimizer=lamb \
     --save_checkpoints_steps=6250 \
     --start_warmup_step=0 \
-    --num_gpus=8 \
-    --train_batch_size=24
+    --num_gpus=1 \
+    --train_batch_size=12 |& tee "$LOG_DIR/train_console.log"
 
 # Copy log file to MLCube log folder
 if [ "$LOG_DIR" != "" ]; then
     timestamp=$(date +%Y%m%d_%H%M%S)
-    cp mlperf_compliance.log "$LOG_DIR/mlperf_compliance_$timestamp.log"
+    cp bert.log "$LOG_DIR/bert_train_$timestamp.log"
 fi
 
+TF_XLA_FLAGS='--tf_xla_auto_jit=2' \
+python3 run_pretraining.py \
+    --bert_config_file=$CONFIG_PATH \
+    --output_dir=$OUTPUT_DIR \
+    --input_file=$EVAL_FILE \
+    --do_eval \
+    --nodo_train \
+    --eval_batch_size=8 \
+    --init_checkpoint=$OUTPUT_DIR/model.ckpt-107538 \
+    --iterations_per_loop=1000 \
+    --learning_rate=0.0001 \
+    --max_eval_steps=1250 \
+    --max_predictions_per_seq=76 \
+    --max_seq_length=512 \
+    --num_gpus=1 \
+    --num_train_steps=107538 \
+    --num_warmup_steps=1562 \
+    --optimizer=lamb \
+    --save_checkpoints_steps=1562 \
+    --start_warmup_step=0 \
+    --train_batch_size=24 \
+    --nouse_tpu |& tee "$LOG_DIR/eval_console.log"
+
+# Copy log file to MLCube log folder
+if [ "$LOG_DIR" != "" ]; then
+    timestamp=$(date +%Y%m%d_%H%M%S)
+    cp mlperf_compliance.log "$LOG_DIR/mlperf_compliance_$timestamp.log"
+    cp bert.log "$LOG_DIR/bert_eval_$timestamp.log"
+fi
+
 set +x
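
Note: the eval run hard-codes $OUTPUT_DIR/model.ckpt-107538, which only exists if training completed all num_train_steps. A sketch of a more defensive alternative (not what the script does) using tf.train.latest_checkpoint, which reads the checkpoint index file the training run maintains:

    # Sketch: resolve the newest checkpoint in OUTPUT_DIR instead of
    # hard-coding model.ckpt-107538.
    import tensorflow.compat.v1 as tf

    output_dir = "/workspace/final_output"  # hypothetical path
    ckpt = tf.train.latest_checkpoint(output_dir)
    if ckpt is None:
        raise SystemExit(f"no checkpoint found in {output_dir}")
    print(ckpt)  # pass this value as --init_checkpoint for the eval run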
