Skip to content

Commit

Permalink
Add HPC 3.0 missing files
Browse files Browse the repository at this point in the history
  • Loading branch information
pgmpablo157321 committed Sep 18, 2023
1 parent 7393de9 commit 7099780
Show file tree
Hide file tree
Showing 4 changed files with 260 additions and 0 deletions.
82 changes: 82 additions & 0 deletions mlperf_logging/rcp_checker/hpc_3.0.0/rcps_cosmoflow.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
{
"cosmoflow_ref_32": {
"Benchmark": "cosmoflow",
"BS": 32,
"Epochs to converge": [ 8, 8, 9, 8, 6, 8, 6, 6, 7, 8, 8, 8, 9, 8, 7, 8, 6, 9, 8, 8, 8, 8, 9, 8, 9, 8, 8, 8, 8, 8, 8, 8, 8 ],
"Hyperparams": {
"global_batch_size": 32,
"opt_name": "SGD",
"opt_base_learning_rate": 0.001,
"opt_learning_rate_warmup_epochs": 2,
"opt_learning_rate_warmup_factor": 4.0,
"opt_learning_rate_decay_boundary_epochs": [ 4, 6 ],
"opt_learning_rate_decay_factor": [0.25, 0.0625],
"dropout": 0.5,
"opt_weight_decay": 0.0
}
},
"cosmoflow_ref_64": {
"Benchmark": "cosmoflow",
"BS": 64,
"Epochs to converge": [ 19, 18, 20, 17, 18, 19, 18, 18, 18, 18, 18, 18, 18, 19, 18, 18, 18, 18, 18, 20 ],
"Hyperparams": {
"global_batch_size": 64,
"opt_name": "SGD",
"opt_base_learning_rate": 0.001,
"opt_learning_rate_warmup_epochs": 4,
"opt_learning_rate_warmup_factor": 1.0,
"opt_learning_rate_decay_boundary_epochs": [ 16, 32 ],
"opt_learning_rate_decay_factor": 0.25,
"dropout": 0.0,
"opt_weight_decay": 0.01
}
},
"cosmoflow_ref_128": {
"Benchmark": "cosmoflow",
"BS": 128,
"Epochs to converge": [ 16, 18, 18, 18, 18, 17, 18, 18, 18, 18, 18, 18, 17, 18, 18, 18, 19, 18, 18, 18 ],
"Hyperparams": {
"global_batch_size": 128,
"opt_name": "SGD",
"opt_base_learning_rate": 0.004,
"opt_learning_rate_warmup_epochs": 4,
"opt_learning_rate_warmup_factor": 4.0,
"opt_learning_rate_decay_boundary_epochs": [ 16, 32 ],
"opt_learning_rate_decay_factor": 0.25,
"dropout": 0.5,
"opt_weight_decay": 0.0
}
},
"cosmoflow_ref_512": {
"Benchmark": "cosmoflow",
"BS": 512,
"Epochs to converge": [ 22, 23, 23, 24, 24, 21, 23, 23, 23, 23, 24, 23, 23, 21, 23, 23, 21, 23, 23, 23 ],
"Hyperparams": {
"global_batch_size": 512,
"opt_name": "SGD",
"opt_base_learning_rate": 0.006,
"opt_learning_rate_warmup_epochs": 6,
"opt_learning_rate_warmup_factor": 4.0,
"opt_learning_rate_decay_boundary_epochs": [ 19, 21, 22, 23 ],
"opt_learning_rate_decay_factor": [0.5, 0.25, 0.125, 0.0625],
"dropout": 0.0,
"opt_weight_decay": 0.01
}
},
"cosmoflow_ref_1024": {
"Benchmark": "cosmoflow",
"BS": 1024,
"Epochs to converge": [ 42, 38, 42, 40, 40, 39, 43, 37, 39, 37, 43, 39, 38, 42, 40, 42, 42, 38, 36, 43 ],
"Hyperparams": {
"global_batch_size": 1024,
"opt_name": "SGD",
"opt_base_learning_rate": 0.012,
"opt_learning_rate_warmup_epochs": 0,
"opt_learning_rate_warmup_factor": 1.0,
"opt_learning_rate_decay_boundary_epochs": [ 32, 64 ],
"opt_learning_rate_decay_factor": 0.25,
"dropout": 0.5,
"opt_weight_decay": 0.0
}
}
}
119 changes: 119 additions & 0 deletions mlperf_logging/rcp_checker/hpc_3.0.0/rcps_deepcam.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
{
"deepcam_ref_128": {
"Benchmark": "deepcam",
"BS": 128,
"Epochs to converge": [ 7, 7, 7, 7, 7, 7, 7, 8, 8, 6, 7, 8, 7, 6, 6, 7, 7, 6, 7, 7 ],
"Hyperparams": {
"batchnorm_groupsize": 1,
"global_batch_size": 128,
"gradient_accumulation_frequency": 1,
"num_workers": 32,
"opt_betas": [ 0.9, 0.999 ],
"opt_bias_correction": true,
"opt_eps": 1e-06,
"opt_grad_averaging": true,
"opt_lr": 0.00155,
"opt_max_grad_norm": 1.0,
"opt_weight_decay": 0.01,
"opt_name": "LAMB",
"scheduler_decay_rate": "0.1",
"scheduler_lr_warmup_factor": 1.0,
"scheduler_lr_warmup_steps": 0,
"scheduler_t_max": "9000",
"scheduler_eta_min": "0.0",
"scheduler_type": "cosine_annealing"
}
},
"deepcam_ref_256": {
"Benchmark": "deepcam",
"BS": 256,
"Epochs to converge": [ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9 ],
"Hyperparams": {
"batchnorm_groupsize": 1,
"global_batch_size": 256,
"gradient_accumulation_frequency": 1,
"opt_betas": [ 0.9, 0.999 ],
"opt_bias_correction": true,
"opt_eps": 1e-06,
"opt_grad_averaging": true,
"opt_lr": 0.002,
"opt_max_grad_norm": 1.0,
"opt_weight_decay": 0.01,
"opt_name": "LAMB",
"scheduler_decay_rate": 0.1,
"scheduler_lr_warmup_factor": 1.0,
"scheduler_lr_warmup_steps": 0,
"scheduler_milestones": [ 4096, 8192 ],
"scheduler_type": "multistep"
}
},
"deepcam_ref_512": {
"Benchmark": "deepcam",
"BS": 512,
"Epochs to converge": [ 11, 11, 11, 11, 11, 10, 16, 12, 11, 11, 11, 10, 11, 11, 11, 10, 11, 11, 11, 11, 11, 11, 11, 12, 11, 11, 11, 11, 13, 10, 11, 13, 11, 11, 11, 10, 14, 10, 10, 10 ],
"Hyperparams": {
"batchnorm_groupsize": 1,
"global_batch_size": 512,
"gradient_accumulation_frequency": 1,
"opt_betas": [ 0.9, 0.999 ],
"opt_bias_correction": true,
"opt_eps": 1e-06,
"opt_grad_averaging": true,
"opt_lr": 0.004,
"opt_max_grad_norm": 1.0,
"opt_weight_decay": 0.01,
"opt_name": "LAMB",
"scheduler_decay_rate": 0.1,
"scheduler_lr_warmup_factor": 1.0,
"scheduler_lr_warmup_steps": 100,
"scheduler_milestones": [ 2048, 4096 ],
"scheduler_type": "multistep"
}
},
"deepcam_ref_1024": {
"Benchmark": "deepcam",
"BS": 1024,
"Epochs to converge": [ 13, 13, 12, 13, 13, 13, 14, 13, 13, 13 ],
"Hyperparams": {
"batchnorm_groupsize": 1,
"global_batch_size": 1024,
"gradient_accumulation_frequency": 1,
"opt_betas": [ 0.9, 0.999 ],
"opt_bias_correction": true,
"opt_eps": 1e-06,
"opt_grad_averaging": true,
"opt_lr": 0.004,
"opt_max_grad_norm": 1.0,
"opt_weight_decay": 0.01,
"opt_name": "LAMB",
"scheduler_decay_rate": 0.1,
"scheduler_lr_warmup_factor": 1.0,
"scheduler_lr_warmup_steps": 200,
"scheduler_milestones": [ 1100, 4096 ],
"scheduler_type": "multistep"
}
},
"deepcam_ref_2048": {
"Benchmark": "deepcam",
"BS": 2048,
"Epochs to converge": [ 23, 23, 22, 23, 22, 26, 22, 22, 24, 22 ],
"Hyperparams": {
"batchnorm_groupsize": 1,
"global_batch_size": 2048,
"gradient_accumulation_frequency": 1,
"opt_betas": [ 0.9, 0.999 ],
"opt_bias_correction": true,
"opt_eps": 1e-06,
"opt_grad_averaging": true,
"opt_lr": 0.0055,
"opt_max_grad_norm": 1.0,
"opt_weight_decay": 0.01,
"opt_name": "LAMB",
"scheduler_decay_rate": "0.1",
"scheduler_lr_warmup_factor": 1.0,
"scheduler_lr_warmup_steps": 400,
"scheduler_milestones": [ 800 ],
"scheduler_type": "multistep"
}
}
}
41 changes: 41 additions & 0 deletions mlperf_logging/rcp_checker/hpc_3.0.0/rcps_oc20.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
{
"oc20_ref_256": {
"Benchmark": "oc20",
"BS": 256,
"Epochs to converge": [ 20, 18, 20, 20, 22, 18, 19, 19, 21, 21 ],
"Hyperparams": {
"global_batch_size": 256,
"opt_base_learning_rate": 0.0004,
"opt_learning_rate_warmup_steps": 31252,
"opt_learning_rate_warmup_factor": 0.2,
"opt_learning_rate_decay_boundary_steps": [ 125008, 187512, 250016 ],
"opt_learning_rate_decay_factor": 0.1
}
},
"oc20_ref_1024": {
"Benchmark": "oc20",
"BS": 1024,
"Epochs to converge": [ 23, 25, 22, 25, 25, 25, 25, 24, 23, 25 ],
"Hyperparams": {
"global_batch_size": 1024,
"opt_base_learning_rate": 0.0012,
"opt_learning_rate_warmup_steps": 7816,
"opt_learning_rate_warmup_factor": 0.2,
"opt_learning_rate_decay_boundary_steps": [ 31264, 46896 ],
"opt_learning_rate_decay_factor": 0.1
}
},
"oc20_ref_2048": {
"Benchmark": "oc20",
"BS": 2048,
"Epochs to converge": [ 33, 32, 33, 33, 33, 34, 33, 33, 30, 33 ],
"Hyperparams": {
"global_batch_size": 2048,
"opt_base_learning_rate": 0.0016,
"opt_learning_rate_warmup_steps": 3908,
"opt_learning_rate_warmup_factor": 0.2,
"opt_learning_rate_decay_boundary_steps": [ 23448, 31264 ],
"opt_learning_rate_decay_factor": 0.1
}
}
}
18 changes: 18 additions & 0 deletions scripts/verify_for_v3.0_hpc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#
# Runs the MLPerf logging verification tools (package checker, result
# summarizer, repo checker) for the HPC v3.0.0 ruleset on one submission.
#
# Usage: verify_for_v3.0_hpc.sh <submission_dir>
#
# Arguments:
#   $1 - top-level submission directory.
#
# If <submission_dir>/package_checker_params exists, each non-blank line of it
# is passed to the package checker as an extra option: the first word of the
# line is prefixed with "--" and any remaining words follow as separate
# arguments.
#
# Exits non-zero as soon as any of the tools fails (set -e), or with status 2
# when the submission directory argument is missing.

set -e

if [[ $# -lt 1 ]]; then
  printf 'Usage: %s <submission_dir>\n' "${0##*/}" >&2
  exit 2
fi

SUBMISSION_DIR="$1"

# rcp_bypass and rcp_bert_train_samples package checker params
# need to be retrieved from the package_checker_params file at the top-level
# submission dir.
# An array (rather than a concatenated string) keeps every option a separate,
# glob-safe word when it is expanded on the command line below.
PACKAGE_CHECKER_PARAMS=()
PACKAGE_CHECKER_PARAMS_FILE="$SUBMISSION_DIR/package_checker_params"
if [[ -f "$PACKAGE_CHECKER_PARAMS_FILE" ]]; then
  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Split the line on whitespace so "name value" still yields two arguments.
    read -r -a words <<< "$line"
    # Skip blank lines; they would otherwise become a bare "--" argument.
    (( ${#words[@]} > 0 )) || continue
    PACKAGE_CHECKER_PARAMS+=("--${words[0]}" "${words[@]:1}")
  done < "$PACKAGE_CHECKER_PARAMS_FILE"
fi

python3 -m mlperf_logging.package_checker "$SUBMISSION_DIR" hpc 3.0.0 "${PACKAGE_CHECKER_PARAMS[@]}"
python3 -m mlperf_logging.result_summarizer "$SUBMISSION_DIR" hpc 3.0.0
python3 -m mlperf_logging.repo_checker "$SUBMISSION_DIR" hpc 3.0.0

0 comments on commit 7099780

Please sign in to comment.