Skip to content

Commit

Permalink
Add quick SSD demo (#683)
Browse files Browse the repository at this point in the history
* Add quick demo

* Fix demo data

* Reduce demo time
  • Loading branch information
davidjurado authored Mar 7, 2024
1 parent e3769c8 commit e237206
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 1 deletion.
1 change: 1 addition & 0 deletions single_stage_detector/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
mlcube/workspace/
13 changes: 13 additions & 0 deletions single_stage_detector/mlcube/mlcube.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,16 @@ tasks:
log_dir: logs/
outputs:
checker_logs_dir: checker_logs/
download_demo:
entrypoint: ../scripts/download_openimages_demo.sh -a
parameters:
outputs:
data_dir: demo/
demo:
entrypoint: ./run_demo.sh -a
# torchrun --standalone --nnodes=1 train.py --epochs=1 --batch-size=16 --eval-batch-size=16
parameters:
inputs:
data_dir: demo/
outputs:
log_dir: logs/
2 changes: 1 addition & 1 deletion single_stage_detector/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
scikit-image>=0.15.0
ujson>=4.0.2
matplotlib>=3.5.1
pycocotools==2.0.4
pycocotools>=2.0.4
git+https://github.com/mlcommons/logging.git@1.1.0-rc4
fiftyone==0.15.1
30 changes: 30 additions & 0 deletions single_stage_detector/scripts/download_openimages_demo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Download a small Open Images subset (Apple + Banana classes only) used by
# the quick SSD demo.
#
# Usage:
#   download_openimages_demo.sh [-d | --dataset-path PATH] [--data_dir=PATH]
#
# Environment:
#   DATASET_PATH  destination directory
#                 (default: /datasets/open-images-v6-mlperf)

: "${DATASET_PATH:=/datasets/open-images-v6-mlperf}"

# Parse arguments; --data_dir=PATH is the form MLCube passes.
while [ "$1" != "" ]; do
    case $1 in
        -d | --dataset-path)
            shift
            DATASET_PATH=$1
            ;;
        --data_dir=*)
            # When launched by MLCube the working directory is the ssd
            # folder, but fiftyone_openimages.py lives in ../scripts.
            if [[ "$PWD" = /workspace/single_stage_detector/ssd ]]; then
                cd ../scripts
                DATASET_PATH="${1#*=}"
            fi
            ;;
    esac
    shift
done

echo "saving to"
# Quote expansions so paths containing spaces or glob chars are not mangled.
echo "$DATASET_PATH"
ls "$DATASET_PATH"

MLPERF_CLASSES=('Apple' 'Banana')

python fiftyone_openimages.py \
    --dataset-dir="${DATASET_PATH}" \
    --output-labels="openimages-mlperf.json" \
    --classes "${MLPERF_CLASSES[@]}"
114 changes: 114 additions & 0 deletions single_stage_detector/ssd/run_demo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#!/bin/bash

# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs the quick SSD demo benchmark and reports wall-clock time.
#
# Usage:
#   run_demo.sh [--data_dir=PATH] [--log_dir=PATH]
#
# Environment overrides:
#   BATCHSIZE, EVALBATCHSIZE, NUMEPOCHS, LOG_INTERVAL, DATASET_DIR,
#   TORCH_HOME, DEBUG (=1 enables command tracing), EXTRA_PARAMS

set +x
set -e

# Only rank 0 keeps tracing on. Check non-emptiness first so the integer
# comparison does not emit "integer expression expected" when the script
# runs outside Slurm (SLURM_LOCALID unset).
[ -n "${SLURM_LOCALID-}" ] && [ "${SLURM_LOCALID}" -ne 0 ] && set +x


# start timing
start=$(date +%s)
start_fmt=$(date +%Y-%m-%d\ %r)
echo "STARTING TIMING RUN AT $start_fmt"

# Set variables (env-overridable defaults)
[ "${DEBUG-}" = "1" ] && set -x
BATCHSIZE=${BATCHSIZE:-4}
EVALBATCHSIZE=${EVALBATCHSIZE:-${BATCHSIZE}}
NUMEPOCHS=${NUMEPOCHS:-1}
LOG_INTERVAL=${LOG_INTERVAL:-20}
DATASET_DIR=${DATASET_DIR:-"/datasets/open-images-v6-mlperf"}
TORCH_HOME=${TORCH_HOME:-"$(pwd)/torch-model-cache"}

# Handle MLCube parameters (--key=value form)
while [ $# -gt 0 ]; do
    case "$1" in
        --data_dir=*)
            DATASET_DIR="${1#*=}"
            ;;
        --log_dir=*)
            LOG_DIR="${1#*=}"
            ;;
        *)
    esac
    shift
done


# run benchmark
echo "running benchmark"



declare -a CMD
if [ -n "${SLURM_LOCALID-}" ]; then
    # Mode 1: Slurm launched a task for each GPU and set some envvars;
    # no need for a parallel launcher here.
    cluster=''
    if [[ "${DGXSYSTEM-}" == DGX2* ]]; then
        cluster='circe'
    fi
    if [[ "${DGXSYSTEM-}" == DGXA100* ]]; then
        cluster='selene'
    fi
    if [ "${SLURM_NTASKS}" -gt "${SLURM_JOB_NUM_NODES}" ]; then
        # NSYSCMD is intentionally unquoted: it may hold a multi-word
        # profiler prefix (or be empty).
        CMD=( './bind.sh' "--cluster=${cluster}" '--ib=single' '--' ${NSYSCMD} 'python' '-u' )
    else
        CMD=( 'python' '-u' )
    fi
else
    # Mode 2: single-node Docker; launch with torchrun.
    CMD=( "torchrun" "--standalone" "--nnodes=1" "--nproc_per_node=1" )
    [ "${MEMBIND-}" = false ] && CMD+=( "--no_membind" )
fi

PARAMS=(
    --batch-size              "${BATCHSIZE}"
    --eval-batch-size         "${EVALBATCHSIZE}"
    --epochs                  "${NUMEPOCHS}"
    --print-freq              "${LOG_INTERVAL}"
    --data-path               "${DATASET_DIR}"
)

# Run training. Capture the exit code with `|| ret_code=$?` — under `set -e`
# the original `cmd ; ret_code=$?` form would terminate the script on failure
# before the status was recorded, skipping the log copy and timing report.
ret_code=0
"${CMD[@]}" train.py "${PARAMS[@]}" ${EXTRA_PARAMS} || ret_code=$?

# Copy compliance log to the MLCube log folder, if one was requested.
if [ -n "${LOG_DIR-}" ]; then
    timestamp=$(date +%Y%m%d_%H%M%S)
    cp mlperf_compliance.log "$LOG_DIR/mlperf_compliance_$timestamp.log"
fi

set +x

sleep 3
if [[ $ret_code != 0 ]]; then exit $ret_code; fi

# end timing
end=$(date +%s)
end_fmt=$(date +%Y-%m-%d\ %r)
echo "ENDING TIMING RUN AT $end_fmt"

# report result
result=$(( end - start ))
result_name="SINGLE_STAGE_DETECTOR"

echo "RESULT,$result_name,,$result,nvidia,$start_fmt"

0 comments on commit e237206

Please sign in to comment.