diff --git a/single_stage_detector/.dockerignore b/single_stage_detector/.dockerignore
new file mode 100644
index 000000000..0af9b975f
--- /dev/null
+++ b/single_stage_detector/.dockerignore
@@ -0,0 +1 @@
+mlcube/workspace/
\ No newline at end of file
diff --git a/single_stage_detector/mlcube/mlcube.yaml b/single_stage_detector/mlcube/mlcube.yaml
index 2e1b412a3..789007c3e 100644
--- a/single_stage_detector/mlcube/mlcube.yaml
+++ b/single_stage_detector/mlcube/mlcube.yaml
@@ -42,3 +42,16 @@ tasks:
         log_dir: logs/
       outputs:
         checker_logs_dir: checker_logs/
+  download_demo:
+    entrypoint: ../scripts/download_openimages_demo.sh -a
+    parameters:
+      outputs:
+        data_dir: demo/
+  demo:
+    entrypoint: ./run_demo.sh -a
+    # torchrun --standalone --nnodes=1 train.py --epochs=1 --batch-size=16 --eval-batch-size=16
+    parameters:
+      inputs:
+        data_dir: demo/
+      outputs:
+        log_dir: logs/
diff --git a/single_stage_detector/requirements.txt b/single_stage_detector/requirements.txt
index ad2d26a46..35e86b944 100644
--- a/single_stage_detector/requirements.txt
+++ b/single_stage_detector/requirements.txt
@@ -1,6 +1,6 @@
 scikit-image>=0.15.0
 ujson>=4.0.2
 matplotlib>=3.5.1
-pycocotools==2.0.4
+pycocotools>=2.0.4
 git+https://github.com/mlcommons/logging.git@1.1.0-rc4
 fiftyone==0.15.1
diff --git a/single_stage_detector/scripts/download_openimages_demo.sh b/single_stage_detector/scripts/download_openimages_demo.sh
new file mode 100755
index 000000000..3d73ac4f3
--- /dev/null
+++ b/single_stage_detector/scripts/download_openimages_demo.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+: "${DATASET_PATH:=/datasets/open-images-v6-mlperf}"
+
+while [ "$1" != "" ]; do
+    case $1 in
+    -d | --dataset-path)
+        shift
+        DATASET_PATH=$1
+        ;;
+    --data_dir=*)
+        if [[ "$PWD" = /workspace/single_stage_detector/ssd ]]; then
+            cd ../scripts || exit 1
+            DATASET_PATH="${1#*=}"
+        fi
+        ;;
+    esac
+    shift
+done
+
+echo "saving to"
+echo "$DATASET_PATH"
+ls "$DATASET_PATH"
+
+MLPERF_CLASSES=('Apple' 'Banana')
+
+python fiftyone_openimages.py \
+    --dataset-dir="${DATASET_PATH}" \
+    --output-labels="openimages-mlperf.json" \
+    --classes "${MLPERF_CLASSES[@]}"
diff --git a/single_stage_detector/ssd/run_demo.sh b/single_stage_detector/ssd/run_demo.sh
new file mode 100755
index 000000000..43be5a0ad
--- /dev/null
+++ b/single_stage_detector/ssd/run_demo.sh
@@ -0,0 +1,114 @@
+#!/bin/bash
+
+# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# runs benchmark and reports time to convergence
+# to use the script:
+#   run_demo.sh
+
+set +x
+set -e
+
+# Trace only on rank 0; default to 0 when not launched by Slurm so the numeric test is valid
+[ "${SLURM_LOCALID:-0}" -ne 0 ] && set +x
+
+
+# start timing
+start=$(date +%s)
+start_fmt=$(date +%Y-%m-%d\ %r)
+echo "STARTING TIMING RUN AT $start_fmt"
+
+# Set variables
+[ "${DEBUG:-0}" = "1" ] && set -x
+BATCHSIZE=${BATCHSIZE:-4}
+EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
+NUMEPOCHS=${NUMEPOCHS:-1}
+LOG_INTERVAL=${LOG_INTERVAL:-20}
+DATASET_DIR=${DATASET_DIR:-"/datasets/open-images-v6-mlperf"}
+TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}
+
+# Handle MLCube parameters
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --data_dir=*)
+      DATASET_DIR="${1#*=}"
+      ;;
+    --log_dir=*)
+      LOG_DIR="${1#*=}"
+      ;;
+    *) ;;
+  esac
+  shift
+done
+
+
+# run benchmark
+echo "running benchmark"
+
+
+
+declare -a CMD
+if [ -n "${SLURM_LOCALID-}" ]; then
+  # Mode 1: Slurm launched a task for each GPU and set some envvars; no need for parallel launch
+  cluster=''
+  if [[ "${DGXSYSTEM}" == DGX2* ]]; then
+    cluster='circe'
+  fi
+  if [[ "${DGXSYSTEM}" == DGXA100* ]]; then
+    cluster='selene'
+  fi
+  if [ "${SLURM_NTASKS}" -gt "${SLURM_JOB_NUM_NODES}" ]; then
+    CMD=( './bind.sh' "--cluster=${cluster}" '--ib=single' '--' ${NSYSCMD} 'python' '-u' )
+  else
+    CMD=( 'python' '-u' )
+  fi
+else
+  # Mode 2: Single-node Docker; need to launch tasks with torchrun
+  CMD=( "torchrun" "--standalone" "--nnodes=1" "--nproc_per_node=1" )
+  [ "${MEMBIND:-}" = false ] && CMD+=( "--no_membind" )
+fi
+
+PARAMS=(
+    --batch-size "${BATCHSIZE}"
+    --eval-batch-size "${EVALBATCHSIZE}"
+    --epochs "${NUMEPOCHS}"
+    --print-freq "${LOG_INTERVAL}"
+    --data-path "${DATASET_DIR}"
+)
+
+# run training; capture exit code without tripping `set -e` so the log copy below still runs
+"${CMD[@]}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} && ret_code=0 || ret_code=$?
+
+# Copy log file to MLCube log folder
+if [ "${LOG_DIR:-}" != "" ]; then
+  timestamp=$(date +%Y%m%d_%H%M%S)
+  cp mlperf_compliance.log "$LOG_DIR/mlperf_compliance_$timestamp.log"
+fi
+
+set +x
+
+sleep 3
+if [[ "$ret_code" != 0 ]]; then exit "$ret_code"; fi
+
+# end timing
+end=$(date +%s)
+end_fmt=$(date +%Y-%m-%d\ %r)
+echo "ENDING TIMING RUN AT $end_fmt"
+
+# report result
+result=$(( end - start ))
+result_name="SINGLE_STAGE_DETECTOR"
+
+echo "RESULT,$result_name,,$result,nvidia,$start_fmt"