Skip to content

Commit

Permalink
Merge pull request #1 from davidjurado/tpu
Browse files Browse the repository at this point in the history
Tpu
  • Loading branch information
davidjurado authored May 18, 2023
2 parents 7a5eb75 + 1c75a26 commit d10b209
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 129 deletions.
21 changes: 21 additions & 0 deletions language_model/tensorflow/bert/Dockerfile_tpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Image for running the BERT reference code in /workspace/bert with the
# TPU-VM build of TensorFlow 2.10.0 and libtpu 1.4.0.
FROM python:3.8

# TensorFlow wheel (CPython 3.8, linux x86_64) from the Cloud TPU VM artifacts
# bucket. --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-2.10.0/tensorflow-2.10.0-cp38-cp38-linux_x86_64.whl

# libtpu shared library, installed at /lib/libtpu.so.
# -f makes curl exit non-zero on an HTTP error instead of silently saving the
# error page as the library; -sS hides the progress meter but still prints errors.
RUN curl -fsSL https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/1.4.0/libtpu.so -o /lib/libtpu.so

# OS packages. git must be present before the pip step below, because
# requirements.txt is installed afterwards and may reference git+https sources.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        git \
        libcurl4 \
        unzip \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Install python dependencies
# The manifest is copied on its own so this layer is reused when only the
# source code changes.
COPY requirements.txt /
RUN pip install --no-cache-dir -r /requirements.txt

# Copy code
COPY . /workspace/bert

# Make the helper scripts executable (single layer for both directories).
RUN chmod +x /workspace/bert/cleanup_scripts/*.sh /workspace/bert/*.sh

# Set working directory
WORKDIR /workspace/bert
17 changes: 1 addition & 16 deletions language_model/tensorflow/bert/mlcube/mlcube.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,25 +54,10 @@ tasks:
log_dir: logs/
output_dir: final_output/
train_tpu:
entrypoint: ./run_and_time_tpu.sh -a
entrypoint: ./run_tpu.sh -a
parameters:
inputs:
tfdata_path: tf_data/
init_checkpoint:
type: file
default: data/tf2_ckpt/model.ckpt-28252
eval_file:
type: file
default: tf_eval_data/eval_10k
config_path:
type: file
default: data/bert_config.json
parameters_file:
type: file
default: parameters.yaml
outputs:
log_dir: logs/
output_dir: final_output/
check_logs:
entrypoint: ./check_logs.sh -a
parameters:
Expand Down
3 changes: 2 additions & 1 deletion language_model/tensorflow/bert/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
gdown==4.6.4
git+https://github.com/mlcommons/logging.git@1.1.0-rc4
git+https://github.com/mlcommons/logging.git@1.1.0-rc4
cloud-tpu-client
112 changes: 0 additions & 112 deletions language_model/tensorflow/bert/run_and_time.sh

This file was deleted.

90 changes: 90 additions & 0 deletions language_model/tensorflow/bert/run_tpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#!/bin/bash
#
# Runs BERT pretraining (run_pretraining.py) on a Cloud TPU, mirrors the
# console output to "$LOG_DIR/train_console.log", and prints start/end
# timestamps around the run.
#
# Inputs can be given as environment variables or as MLCube-style
# "--name=value" arguments:
#   --tfdata_path, --config_path, --init_checkpoint,
#   --log_dir, --output_dir, --eval_file
# Any other argument is ignored.
#
# Exit status: the exit status of run_pretraining.py.

set +x
set -e

# start timing
# "start" is epoch seconds; "start_fmt" is the human-readable form that is printed.
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

# Set variables
# ":=" assigns the default only when the variable is unset or empty, so values
# exported by the caller take precedence.
: "${TFDATA_PATH:=./workspace/output_data}"
: "${INIT_CHECKPOINT:=./workspace/data/tf2_ckpt}"
: "${EVAL_FILE:=./workspace/tf_eval_data/eval_10k}"
: "${CONFIG_PATH:=./workspace/data/bert_config.json}"
: "${LOG_DIR:=./workspace/logs}"
: "${OUTPUT_DIR:=./workspace/final_output}"

# Handle MLCube parameters
# "${1#*=}" strips everything up to and including the first "=", leaving the value.
while [ $# -gt 0 ]; do
  case "$1" in
  --tfdata_path=*)
    TFDATA_PATH="${1#*=}"
    ;;
  --config_path=*)
    CONFIG_PATH="${1#*=}"
    ;;
  --init_checkpoint=*)
    INIT_CHECKPOINT="${1#*=}"
    ;;
  --log_dir=*)
    LOG_DIR="${1#*=}"
    ;;
  --output_dir=*)
    OUTPUT_DIR="${1#*=}"
    ;;
  --eval_file=*)
    EVAL_FILE="${1#*=}"
    ;;
  *) ;;
  esac
  shift
done

# NOTE(review): TFDATA_PATH, INIT_CHECKPOINT, EVAL_FILE, CONFIG_PATH and
# OUTPUT_DIR are parsed above, but the command below uses fixed gs:// locations;
# only LOG_DIR is actually consumed. Confirm whether that is intentional.

# tee cannot create missing parent directories, so make sure the log dir exists.
mkdir -p "$LOG_DIR"

# run benchmark
echo "running benchmark"

# The exit status of a pipeline is that of its last command (tee), which would
# hide a training failure. Suspend errexit for the pipeline and read python's
# own status from PIPESTATUS immediately afterwards.
set +e
python3 ./run_pretraining.py \
  --bert_config_file=gs://bert_tf_data/bert_config.json \
  --nodo_eval \
  --do_train \
  --eval_batch_size=64 \
  --init_checkpoint=gs://bert_tf_data/tf2_ckpt/model.ckpt-28252 \
  "--input_file=gs://bert_tf_data/tf_data/part-*" \
  --iterations_per_loop=1 \
  --lamb_beta_1=0.88 \
  --lamb_beta_2=0.88 \
  --lamb_weight_decay_rate=0.0166629 \
  --learning_rate=0.00288293 \
  --log_epsilon=-6 \
  --max_eval_steps=125 \
  --max_predictions_per_seq=76 \
  --max_seq_length=512 \
  --num_tpu_cores=8 \
  --num_train_steps=200 \
  --num_warmup_steps=28 \
  --optimizer=lamb \
  --output_dir=gs://bert_tf_data/output/ \
  --save_checkpoints_steps=3 \
  --start_warmup_step=-76 \
  --steps_per_update=1 \
  --train_batch_size=128 \
  --use_tpu \
  --tpu_name=tpu \
  --tpu_zone=us-central1-c \
  --gcp_project=training-reference-bench-test |& tee "$LOG_DIR/train_console.log"
ret_code=${PIPESTATUS[0]}
set -e

set +x

sleep 3
# Propagate a training failure to the caller; on success fall through so the
# end-of-run timestamp is printed.
if [[ "$ret_code" -ne 0 ]]; then exit "$ret_code"; fi

# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"

0 comments on commit d10b209

Please sign in to comment.