diff --git a/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json b/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json
new file mode 100644
index 00000000..54949099
--- /dev/null
+++ b/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json
@@ -0,0 +1,82 @@
+{
+    "cosmoflow_ref_32": {
+        "Benchmark": "cosmoflow",
+        "BS": 32,
+        "Epochs to converge": [ 8, 8, 9, 8, 6, 8, 6, 6, 7, 8, 8, 8, 9, 8, 7, 8, 6, 9, 8, 8, 8, 8, 9, 8, 9, 8, 8, 8, 8, 8, 8, 8, 8 ],
+        "Hyperparams": {
+            "global_batch_size": 32,
+            "opt_name": "SGD",
+            "opt_base_learning_rate": 0.001,
+            "opt_learning_rate_warmup_epochs": 2,
+            "opt_learning_rate_warmup_factor": 4.0,
+            "opt_learning_rate_decay_boundary_epochs": [ 4, 6 ],
+            "opt_learning_rate_decay_factor": [0.25, 0.0625],
+            "dropout": 0.5,
+            "opt_weight_decay": 0.0
+        }
+    },
+    "cosmoflow_ref_64": {
+        "Benchmark": "cosmoflow",
+        "BS": 64,
+        "Epochs to converge": [ 19, 18, 20, 17, 18, 19, 18, 18, 18, 18, 18, 18, 18, 19, 18, 18, 18, 18, 18, 20 ],
+        "Hyperparams": {
+            "global_batch_size": 64,
+            "opt_name": "SGD",
+            "opt_base_learning_rate": 0.001,
+            "opt_learning_rate_warmup_epochs": 4,
+            "opt_learning_rate_warmup_factor": 1.0,
+            "opt_learning_rate_decay_boundary_epochs": [ 16, 32 ],
+            "opt_learning_rate_decay_factor": 0.25,
+            "dropout": 0.0,
+            "opt_weight_decay": 0.01
+        }
+    },
+    "cosmoflow_ref_128": {
+        "Benchmark": "cosmoflow",
+        "BS": 128,
+        "Epochs to converge": [ 16, 18, 18, 18, 18, 17, 18, 18, 18, 18, 18, 18, 17, 18, 18, 18, 19, 18, 18, 18 ],
+        "Hyperparams": {
+            "global_batch_size": 128,
+            "opt_name": "SGD",
+            "opt_base_learning_rate": 0.004,
+            "opt_learning_rate_warmup_epochs": 4,
+            "opt_learning_rate_warmup_factor": 4.0,
+            "opt_learning_rate_decay_boundary_epochs": [ 16, 32 ],
+            "opt_learning_rate_decay_factor": 0.25,
+            "dropout": 0.5,
+            "opt_weight_decay": 0.0
+        }
+    },
+    "cosmoflow_ref_512": {
+        "Benchmark": "cosmoflow",
+        "BS": 512,
+        "Epochs to converge": [ 22, 23, 23, 24, 24, 21, 23, 23, 23, 23, 24, 23, 23, 21, 23, 23, 21, 23, 23, 23 ],
+        "Hyperparams": {
+            "global_batch_size": 512,
+            "opt_name": "SGD",
+            "opt_base_learning_rate": 0.006,
+            "opt_learning_rate_warmup_epochs": 6,
+            "opt_learning_rate_warmup_factor": 4.0,
+            "opt_learning_rate_decay_boundary_epochs": [ 19, 21, 22, 23 ],
+            "opt_learning_rate_decay_factor": [0.5, 0.25, 0.125, 0.0625],
+            "dropout": 0.0,
+            "opt_weight_decay": 0.01
+        }
+    },
+    "cosmoflow_ref_1024": {
+        "Benchmark": "cosmoflow",
+        "BS": 1024,
+        "Epochs to converge": [ 42, 38, 42, 40, 40, 39, 43, 37, 39, 37, 43, 39, 38, 42, 40, 42, 42, 38, 36, 43 ],
+        "Hyperparams": {
+            "global_batch_size": 1024,
+            "opt_name": "SGD",
+            "opt_base_learning_rate": 0.012,
+            "opt_learning_rate_warmup_epochs": 0,
+            "opt_learning_rate_warmup_factor": 1.0,
+            "opt_learning_rate_decay_boundary_epochs": [ 32, 64 ],
+            "opt_learning_rate_decay_factor": 0.25,
+            "dropout": 0.5,
+            "opt_weight_decay": 0.0
+        }
+    }
+}
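Each `*_ref_<batch size>` entry above records the reference hyperparameters and per-trial epochs-to-converge samples for one global batch size. As a rough illustration of how such an entry can be consumed, here is a minimal sketch (run from the repo root once this patch is applied) that loads the new file and reduces one entry's samples to a pruned "olympic" mean, dropping the best and worst trials; the helper name and pruning counts are assumptions for illustration, not the rcp_checker's actual API:

```python
# Minimal sketch, assuming an olympic-style pruned mean; olympic_average and
# its drop counts are illustrative, not the rcp_checker's actual API.
import json

def olympic_average(samples, drop_low=1, drop_high=1):
    """Mean of the samples after dropping the lowest/highest outliers."""
    ordered = sorted(samples)
    kept = ordered[drop_low:len(ordered) - drop_high]
    return sum(kept) / len(kept)

with open('mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json') as f:
    rcps = json.load(f)

ref = rcps['cosmoflow_ref_32']
print(ref['BS'], olympic_average(ref['Epochs to converge']))
```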
"opt_lr": 0.00155, + "opt_max_grad_norm": 1.0, + "opt_weight_decay": 0.01, + "opt_name": "LAMB", + "scheduler_decay_rate": "0.1", + "scheduler_lr_warmup_factor": 1.0, + "scheduler_lr_warmup_steps": 0, + "scheduler_t_max": "9000", + "scheduler_eta_min": "0.0", + "scheduler_type": "cosine_annealing" + } + }, + "deepcam_ref_256": { + "Benchmark": "deepcam", + "BS": 256, + "Epochs to converge": [ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 ], + "Hyperparams": { + "batchnorm_groupsize": 1, + "global_batch_size": 256, + "gradient_accumulation_frequency": 1, + "opt_betas": [ 0.9, 0.999 ], + "opt_bias_correction": true, + "opt_eps": 1e-06, + "opt_grad_averaging": true, + "opt_lr": 0.002, + "opt_max_grad_norm": 1.0, + "opt_weight_decay": 0.01, + "opt_name": "LAMB", + "scheduler_decay_rate": 0.1, + "scheduler_lr_warmup_factor": 1.0, + "scheduler_lr_warmup_steps": 0, + "scheduler_milestones": [ 4096, 8192 ], + "scheduler_type": "multistep" + } + }, + "deepcam_ref_512": { + "Benchmark": "deepcam", + "BS": 512, + "Epochs to converge": [ 11, 11, 11, 11, 11, 10, 16, 12, 11, 11, 11, 10, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 12, 11, 11, 11, 11, 13, 10, 11, 13, 11, 11, 11, 10, 14, 10, 10, 10 ], + "Hyperparams": { + "batchnorm_groupsize": 1, + "global_batch_size": 512, + "gradient_accumulation_frequency": 1, + "opt_betas": [ 0.9, 0.999 ], + "opt_bias_correction": true, + "opt_eps": 1e-06, + "opt_grad_averaging": true, + "opt_lr": 0.004, + "opt_max_grad_norm": 1.0, + "opt_weight_decay": 0.01, + "opt_name": "LAMB", + "scheduler_decay_rate": 0.1, + "scheduler_lr_warmup_factor": 1.0, + "scheduler_lr_warmup_steps": 100, + "scheduler_milestones": [ 2048, 4096 ], + "scheduler_type": "multistep" + } + }, + "deepcam_ref_1024": { + "Benchmark": "deepcam", + "BS": 1024, + "Epochs to converge": [ 13, 13, 12, 13, 13, 13, 14, 13, 13, 13 ], + "Hyperparams": { + "batchnorm_groupsize": 1, + "global_batch_size": 1024, + "gradient_accumulation_frequency": 1, + "opt_betas": [ 0.9, 0.999 ], + "opt_bias_correction": true, + "opt_eps": 1e-06, + "opt_grad_averaging": true, + "opt_lr": 0.004, + "opt_max_grad_norm": 1.0, + "opt_weight_decay": 0.01, + "opt_name": "LAMB", + "scheduler_decay_rate": 0.1, + "scheduler_lr_warmup_factor": 1.0, + "scheduler_lr_warmup_steps": 200, + "scheduler_milestones": [ 1100, 4096 ], + "scheduler_type": "multistep" + } + }, + "deepcam_ref_2048": { + "Benchmark": "deepcam", + "BS": 2048, + "Epochs to converge": [ 23, 23, 22, 23, 22, 26, 22, 22, 24, 22 ], + "Hyperparams": { + "batchnorm_groupsize": 1, + "global_batch_size": 2048, + "gradient_accumulation_frequency": 1, + "opt_betas": [ 0.9, 0.999 ], + "opt_bias_correction": true, + "opt_eps": 1e-06, + "opt_grad_averaging": true, + "opt_lr": 0.0055, + "opt_max_grad_norm": 1.0, + "opt_weight_decay": 0.01, + "opt_name": "LAMB", + "scheduler_decay_rate": "0.1", + "scheduler_lr_warmup_factor": 1.0, + "scheduler_lr_warmup_steps": 400, + "scheduler_milestones": [ 800 ], + "scheduler_type": "multistep" + } + } +} diff --git a/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_oc20.json b/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_oc20.json new file mode 100644 index 00000000..d42987a7 --- /dev/null +++ b/mlperf_logging/rcp_checker/hpc_3.0.0/rcps_oc20.json @@ -0,0 +1,41 @@ +{ + "oc20_ref_256": { + "Benchmark": "oc20", + "BS": 256, + "Epochs to converge": [ 20, 18, 20, 20, 22, 18, 19, 19, 21, 21 ], + "Hyperparams": { + "global_batch_size": 256, + "opt_base_learning_rate": 0.0004, + "opt_learning_rate_warmup_steps": 31252, + "opt_learning_rate_warmup_factor": 0.2, + 
"opt_learning_rate_decay_boundary_steps": [ 125008, 187512, 250016 ], + "opt_learning_rate_decay_factor": 0.1 + } + }, + "oc20_ref_1024": { + "Benchmark": "oc20", + "BS": 1024, + "Epochs to converge": [ 23, 25, 22, 25, 25, 25, 25, 24, 23, 25 ], + "Hyperparams": { + "global_batch_size": 1024, + "opt_base_learning_rate": 0.0012, + "opt_learning_rate_warmup_steps": 7816, + "opt_learning_rate_warmup_factor": 0.2, + "opt_learning_rate_decay_boundary_steps": [ 31264, 46896 ], + "opt_learning_rate_decay_factor": 0.1 + } + }, + "oc20_ref_2048": { + "Benchmark": "oc20", + "BS": 2048, + "Epochs to converge": [ 33, 32, 33, 33, 33, 34, 33, 33, 30, 33 ], + "Hyperparams": { + "global_batch_size": 2048, + "opt_base_learning_rate": 0.0016, + "opt_learning_rate_warmup_steps": 3908, + "opt_learning_rate_warmup_factor": 0.2, + "opt_learning_rate_decay_boundary_steps": [ 23448, 31264 ], + "opt_learning_rate_decay_factor": 0.1 + } + } +} diff --git a/mlperf_logging/result_summarizer/result_summarizer.py b/mlperf_logging/result_summarizer/result_summarizer.py index 1570e216..17e0e6e3 100644 --- a/mlperf_logging/result_summarizer/result_summarizer.py +++ b/mlperf_logging/result_summarizer/result_summarizer.py @@ -322,7 +322,7 @@ def _compute_strong_scaling_scores(desc, system_folder, usage, ruleset): benchmark_folder_parent = os.path.join( system_folder, 'strong') if usage == 'hpc' else system_folder if not os.path.isdir(benchmark_folder_parent): - return benchmark_scores + return benchmark_scores, benchmark_power_scores for benchmark_folder in _get_sub_folders(benchmark_folder_parent): folder_parts = benchmark_folder.split('/') # Check if this benchmark has power results @@ -404,7 +404,7 @@ def _compute_weak_scaling_scores(desc, system_folder, usage, ruleset): benchmark_power_scores = {} has_power = None if not os.path.isdir(benchmark_folder_parent): - return benchmark_scores + return benchmark_scores, benchmark_power_scores for benchmark_folder in _get_sub_folders(benchmark_folder_parent): folder_parts = benchmark_folder.split('/') benchmark = _benchmark_alias(folder_parts[-1]) @@ -457,13 +457,24 @@ def _compute_weak_scaling_scores(desc, system_folder, usage, ruleset): olympic_avg = _compute_olympic_average( power_scores, 1, 1) if olympic_avg is not None: - benchmark_power_scores[benchmark] = olympic_avg + benchmark_power_scores['{}:{}'.format( + benchmark, + 'time_to_train_all', + )] = olympic_avg + benchmark_power_scores['{}:{}'.format( + benchmark, + 'number_of_models', + )] = olympic_avg + benchmark_power_scores['{}:{}'.format( + benchmark, + 'instance_scale', + )] = olympic_avg _fill_empty_benchmark_scores(benchmark_scores, usage, ruleset, weak_scaling=True) - _fill_empty_benchmark_scores(benchmark_power_scores, usage, ruleset) + _fill_empty_benchmark_scores(benchmark_power_scores, usage, ruleset, weak_scaling=True) return benchmark_scores, benchmark_power_scores @@ -565,7 +576,7 @@ def summarize_results(folder, usage, ruleset, csv_file=None): ruleset, weak_scaling=True) power_summary = _get_empty_summary(usage, ruleset) - power_weak_scaling_summary = _get_empty_summary(usage, ruleset) + power_weak_scaling_summary = _get_empty_summary(usage, ruleset, weak_scaling=True) for system_folder in _get_sub_folders(results_folder): folder_parts = system_folder.split('/') diff --git a/scripts/verify_for_v3.0_hpc.sh b/scripts/verify_for_v3.0_hpc.sh new file mode 100755 index 00000000..9206ef56 --- /dev/null +++ b/scripts/verify_for_v3.0_hpc.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e + +# rcp_bypass and 
diff --git a/scripts/verify_for_v3.0_hpc.sh b/scripts/verify_for_v3.0_hpc.sh
new file mode 100755
index 00000000..9206ef56
--- /dev/null
+++ b/scripts/verify_for_v3.0_hpc.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -e
+
+# The rcp_bypass and rcp_bert_train_samples package checker params
+# are read from the package_checker_params file at the top-level submission dir.
+PACKAGE_CHECKER_PARAMS=""
+PACKAGE_CHECKER_PARAMS_FILE="$1/package_checker_params"
+if test -f "$PACKAGE_CHECKER_PARAMS_FILE"; then
+    while IFS= read -r line
+    do
+        PACKAGE_CHECKER_PARAMS="$PACKAGE_CHECKER_PARAMS --$line"
+    done < "$PACKAGE_CHECKER_PARAMS_FILE"
+fi
+
+python3 -m mlperf_logging.package_checker $1 hpc 3.0.0 $PACKAGE_CHECKER_PARAMS
+python3 -m mlperf_logging.result_summarizer $1 hpc 3.0.0
+python3 -m mlperf_logging.repo_checker $1 hpc 3.0.0
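For reference, a rough Python equivalent of the flag-building loop in the script above; the submission path and the file's contents are assumed examples, not a required format:

```python
# Rough Python equivalent of the while-loop above; the path and file contents
# here are assumed examples. Each non-empty line of package_checker_params
# becomes a "--"-prefixed flag passed through to the package checker.
from pathlib import Path

params_file = Path('my_submission/package_checker_params')  # i.e. $1/package_checker_params
lines = params_file.read_text().splitlines() if params_file.is_file() else []
flags = ' '.join('--' + line for line in lines if line)
print(flags or '(no extra package checker params)')
```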