Skip to content

Commit

Permalink
Add quick SSD demo (#683)
Browse files Browse the repository at this point in the history
* Add quick demo

* Fix demo data

* Reduce demo time
  • Loading branch information
davidjurado authored Mar 7, 2024
1 parent e3769c8 commit e237206
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 1 deletion.
1 change: 1 addition & 0 deletions single_stage_detector/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
mlcube/workspace/
13 changes: 13 additions & 0 deletions single_stage_detector/mlcube/mlcube.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,16 @@ tasks:
log_dir: logs/
outputs:
checker_logs_dir: checker_logs/
download_demo:
entrypoint: ../scripts/download_openimages_demo.sh -a
parameters:
outputs:
data_dir: demo/
demo:
entrypoint: ./run_demo.sh -a
# torchrun --standalone --nnodes=1 train.py --epochs=1 --batch-size=16 --eval-batch-size=16
parameters:
inputs:
data_dir: demo/
outputs:
log_dir: logs/
2 changes: 1 addition & 1 deletion single_stage_detector/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
scikit-image>=0.15.0
ujson>=4.0.2
matplotlib>=3.5.1
pycocotools==2.0.4
pycocotools>=2.0.4
git+https://github.com/mlcommons/logging.git@1.1.0-rc4
fiftyone==0.15.1
30 changes: 30 additions & 0 deletions single_stage_detector/scripts/download_openimages_demo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Download a small Open Images subset (Apple + Banana classes only) used by
# the quick SSD demo.
#
# Usage:
#   download_openimages_demo.sh [-d | --dataset-path PATH] [--data_dir=PATH]
#
# Environment:
#   DATASET_PATH  destination directory
#                 (default: /datasets/open-images-v6-mlperf)

: "${DATASET_PATH:=/datasets/open-images-v6-mlperf}"

# Parse arguments; --data_dir=PATH is the form MLCube passes.
while [ "$1" != "" ]; do
    case $1 in
        -d | --dataset-path)
            shift
            DATASET_PATH=$1
            ;;
        --data_dir=*)
            # When launched by MLCube the working directory is the ssd
            # folder, but fiftyone_openimages.py lives in ../scripts.
            if [[ "$PWD" = /workspace/single_stage_detector/ssd ]]; then
                cd ../scripts
                DATASET_PATH="${1#*=}"
            fi
            ;;
    esac
    shift
done

echo "saving to"
# Quote expansions so paths containing spaces or glob chars are not mangled.
echo "$DATASET_PATH"
ls "$DATASET_PATH"

MLPERF_CLASSES=('Apple' 'Banana')

python fiftyone_openimages.py \
    --dataset-dir="${DATASET_PATH}" \
    --output-labels="openimages-mlperf.json" \
    --classes "${MLPERF_CLASSES[@]}"
114 changes: 114 additions & 0 deletions single_stage_detector/ssd/run_demo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/bin/bash

# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs the quick SSD demo benchmark and reports wall-clock time.
#
# Usage:
#   run_demo.sh [--data_dir=PATH] [--log_dir=PATH]
#
# Environment overrides:
#   BATCHSIZE, EVALBATCHSIZE, NUMEPOCHS, LOG_INTERVAL, DATASET_DIR,
#   TORCH_HOME, DEBUG (=1 enables command tracing), EXTRA_PARAMS

set +x
set -e

# Only rank 0 keeps tracing on. Check non-emptiness first so the integer
# comparison does not emit "integer expression expected" when the script
# runs outside Slurm (SLURM_LOCALID unset).
[ -n "${SLURM_LOCALID-}" ] && [ "${SLURM_LOCALID}" -ne 0 ] && set +x


# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

# Set variables (env-overridable defaults)
[ "${DEBUG-}" = "1" ] && set -x
BATCHSIZE=${BATCHSIZE:-4}
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
NUMEPOCHS=${NUMEPOCHS:-1}
LOG_INTERVAL=${LOG_INTERVAL:-20}
DATASET_DIR=${DATASET_DIR:-"/datasets/open-images-v6-mlperf"}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}

# Handle MLCube parameters (--key=value form)
while [ $# -gt 0 ]; do
    case "$1" in
        --data_dir=*)
            DATASET_DIR="${1#*=}"
            ;;
        --log_dir=*)
            LOG_DIR="${1#*=}"
            ;;
        *)
    esac
    shift
done


# run benchmark
echo "running benchmark"



declare -a CMD
if [ -n "${SLURM_LOCALID-}" ]; then
    # Mode 1: Slurm launched a task for each GPU and set some envvars;
    # no need for a parallel launcher here.
    cluster=''
    if [[ "${DGXSYSTEM-}" == DGX2* ]]; then
        cluster='circe'
    fi
    if [[ "${DGXSYSTEM-}" == DGXA100* ]]; then
        cluster='selene'
    fi
    if [ "${SLURM_NTASKS}" -gt "${SLURM_JOB_NUM_NODES}" ]; then
        # NSYSCMD is intentionally unquoted: it may hold a multi-word
        # profiler prefix (or be empty).
        CMD=( './bind.sh' "--cluster=${cluster}" '--ib=single' '--' ${NSYSCMD} 'python' '-u' )
    else
        CMD=( 'python' '-u' )
    fi
else
    # Mode 2: single-node Docker; launch with torchrun.
    CMD=( "torchrun" "--standalone" "--nnodes=1" "--nproc_per_node=1" )
    [ "${MEMBIND-}" = false ] && CMD+=( "--no_membind" )
fi

PARAMS=(
    --batch-size              "${BATCHSIZE}"
    --eval-batch-size         "${EVALBATCHSIZE}"
    --epochs                  "${NUMEPOCHS}"
    --print-freq              "${LOG_INTERVAL}"
    --data-path               "${DATASET_DIR}"
)

# Run training. Capture the exit code with `|| ret_code=$?` — under `set -e`
# the original `cmd ; ret_code=$?` form would terminate the script on failure
# before the status was recorded, skipping the log copy and timing report.
ret_code=0
"${CMD[@]}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} || ret_code=$?

# Copy compliance log to the MLCube log folder, if one was requested.
if [ -n "${LOG_DIR-}" ]; then
    timestamp=$(date +%Y%m%d_%H%M%S)
    cp mlperf_compliance.log "$LOG_DIR/mlperf_compliance_$timestamp.log"
fi

set +x

sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi

# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"

# report result
result=$(( end - start ))
result_name="SINGLE_STAGE_DETECTOR"

echo "RESULT,$result_name,,$result,nvidia,$start_fmt"

0 comments on commit e237206

Please sign in to comment.