Skip to content

Commit

Permalink
Add support for compliance for 4.1 (#382)
Browse files Browse the repository at this point in the history
Some changes slipped and were not changed in #381
This PR fixes it.
  • Loading branch information
mmarcinkiewicz authored Sep 10, 2024
1 parent 5190494 commit 9b5f1b4
Show file tree
Hide file tree
Showing 26 changed files with 828 additions and 30 deletions.
11 changes: 10 additions & 1 deletion mlperf_logging/benchmark_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,16 @@
'llama2_70b_lora',
'stable_diffusion',
'gnn'
]
],
'4.1': [
'bert',
'dlrm_dcnv2',
'gpt3',
'ssd',
'stable_diffusion',
'llama2_70b_lora',
'gnn'
]
},

'hpc': {
Expand Down
42 changes: 19 additions & 23 deletions mlperf_logging/compliance_checker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ To check a log file for compliance:

python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME

By default, 3.1.0 training edition rules are used and the default config is set to `3.1.0/common.yaml`.
By default, 4.1.0 training edition rules are used and the default config is set to `4.1.0/common.yaml`.
This config will check all common keys and enqueue benchmark specific config to be checked as well.
Old training editions, still supported are 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
Old training editions, still supported are 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0

To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0.

Expand All @@ -22,27 +22,23 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_

### Existing config files for training submissions

3.1.0/common.yaml - currently the default config file, checks common fields complience and equeues benchmark-specific config file
3.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
3.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
3.1.0/closed_resnet.yaml - Per-benchmark rules, closed submissions.
3.1.0/closed_ssd.yaml
3.1.0/closed_maskrcnn.yaml
3.1.0/closed_rnnt.yaml
3.1.0/closed_unet3d.yaml
3.1.0/closed_bert.yaml
3.1.0/closed_dlrm_dcnv2.yaml
3.1.0/closed_gpt3.yaml
3.1.0/closed_stable_diffusion.yaml
3.1.0/open_resnet.yaml - Per-benchmark rules, closed submissions.
3.1.0/open_ssd.yaml
3.1.0/open_maskrcnn.yaml
3.1.0/open_rnnt.yaml
3.1.0/open_unet3d.yaml
3.1.0/open_bert.yaml
3.1.0/open_dlrm_dcnv2.yaml
3.1.0/open_gpt3.yaml
3.1.0/open_stable_diffusion.yaml
4.1.0/common.yaml - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
4.1.0/closed_common.yaml - the common rules file for closed submissions. These rules apply to all benchmarks
4.1.0/open_common.yaml - the common rules file for open submissions. These rules apply to all benchmarks
4.1.0/closed_ssd.yaml - Per-benchmark rules, closed submissions.
4.1.0/closed_bert.yaml
4.1.0/closed_dlrm_dcnv2.yaml
4.1.0/closed_gpt3.yaml
4.1.0/closed_gnn.yaml
4.1.0/closed_llama2_70b_lora.yaml
4.1.0/closed_stable_diffusion.yaml
4.1.0/open_ssd.yaml - Per-benchmark rules, open submissions.
4.1.0/open_bert.yaml
4.1.0/open_dlrm_dcnv2.yaml
4.1.0/open_gpt3.yaml
4.1.0/open_gnn.yaml
4.1.0/open_llama2_70b_lora.yaml
4.1.0/open_stable_diffusion.yaml

### Existing config files for HPC submissions

Expand Down
5 changes: 4 additions & 1 deletion mlperf_logging/compliance_checker/mlp_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .ruleset_300 import parse_file as parse_file_300
from .ruleset_310 import parse_file as parse_file_310
from .ruleset_400 import parse_file as parse_file_400
from .ruleset_410 import parse_file as parse_file_410


def parse_file(filename, ruleset='0.6.0'):
Expand All @@ -28,5 +29,7 @@ def parse_file(filename, ruleset='0.6.0'):
return parse_file_310(filename)
elif ruleset == '4.0.0':
return parse_file_400(filename)
elif ruleset == '4.1.0':
return parse_file_410(filename)
else:
raise Exception(f'Ruleset "{ruleset}" is not supported')
raise Exception(f'Ruleset "{ruleset}" is not supported')
105 changes: 105 additions & 0 deletions mlperf_logging/compliance_checker/mlp_parser/ruleset_410.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
'''
Parses a text MLPerf log into a structured format.
'''

from __future__ import print_function

import collections
import json
import re
import sys
from dataclasses import dataclass

from io import open

@dataclass
class LogLine:
    """One parsed ``:::MLLOG`` entry of an MLPerf log.

    Instances are built by ``string_to_logline`` from the JSON payload that
    follows the ``:::MLLOG`` token on a log line.
    """
    # The complete log line the entry was parsed from.
    full_string: str
    # The entry's 'time_ms' field converted to float.
    timestamp: float
    # The entry's 'key' field.
    key: str
    # Dict holding the entry's 'value' and 'metadata' fields (not a string).
    value: dict
    # Index supplied by the caller; parse_generator passes the 0-based
    # position among the MLLOG lines, not the physical file line number.
    lineno: int

# Marker that introduces the JSON payload of an MLPerf log entry.
TOKEN = ':::MLLOG '


def parse_line(line):
    ''' Decodes the JSON payload of a single MLLOG line.

    Returns None when the line does not start with TOKEN; otherwise returns
    whatever json.loads produces for the text following TOKEN. Invalid JSON
    propagates the decoder's exception (a ValueError subclass).
    '''
    if line.startswith(TOKEN):
        payload = line[len(TOKEN):]
        return json.loads(payload)
    return None


def string_to_logline(lineno, string):
    ''' Returns a LogLine or raises a ValueError

    lineno -- index stored unchanged in the resulting LogLine.
    string -- one log line, expected to start with the MLLOG token.

    Every parsing problem is reported as ValueError so that callers which
    collect failures per line (see parse_generator) are not aborted by a
    single malformed entry:
      * the line does not start with the token,
      * the payload is not valid JSON (raised by parse_line),
      * 'time_ms' cannot be converted to float,
      * one of 'time_ms', 'key', 'value', 'metadata' is missing,
      * the payload is not a JSON object, or 'time_ms' has a non-numeric type.
    '''
    m = parse_line(string)

    if m is None:
        # Message kept for compatibility with existing error reports.
        raise ValueError('does not match regex')

    try:
        ts = float(m['time_ms'])  # may raise ValueError, e.g. "1.2.3"
        # TODO check for weird values
        key = m['key']
        j = {'value': m['value'], 'metadata': m['metadata']}
    except KeyError as e:
        # Previously escaped as KeyError and crashed the whole parse.
        raise ValueError('missing required field {}'.format(e)) from e
    except TypeError as e:
        # Payload is not a JSON object, or time_ms is e.g. null / a list.
        raise ValueError('malformed log entry: {}'.format(e)) from e

    return LogLine(string, ts, key, j, lineno)


def parse_file(filename):
    ''' Opens the named log (decoded as latin-1) and parses it.

    Returns the (loglines, errors) pair produced by parse_generator.
    '''
    with open(filename, encoding='latin-1') as log_file:
        parsed = parse_generator(log_file)
    return parsed


def strip_and_dedup(gen):
    ''' Keeps only the lines containing TOKEN, with any leading text removed.

    Everything that precedes TOKEN on a kept line is dropped; because the
    pattern is greedy, a line containing TOKEN several times is cut at the
    last occurrence. Despite the name, duplicate lines are not removed.
    Returns a list of the rewritten lines in their original order.
    '''
    prefix_pattern = re.compile(".*" + TOKEN)
    return [prefix_pattern.sub(TOKEN, text) for text in gen if TOKEN in text]



def parse_generator(gen):
    ''' Parses an iterable of log lines and returns (loglines, errors).

    loglines is the list of successfully parsed LogLine objects; errors is a
    list of (stripped_line, error_message) tuples for lines whose parsing
    raised ValueError. The line number given to each LogLine is its 0-based
    position among the lines kept by strip_and_dedup.
    '''
    parsed = []
    rejected = []
    for index, raw in enumerate(strip_and_dedup(gen)):
        text = raw.strip()
        try:
            entry = string_to_logline(index, text)
        except ValueError as err:
            rejected.append((text, str(err)))
        else:
            parsed.append(entry)
    return parsed, rejected


if __name__ == '__main__':
    # Command-line smoke test: parse one log file and report what failed.
    if len(sys.argv) != 2:
        print('usage: mlp_parser.py FILENAME')
        print(' tests parsing on the file.')
        sys.exit(1)

    parsed, failures = parse_file(sys.argv[1])

    print('Parsed {} log lines with {} errors.'.format(len(parsed), len(failures)))

    if failures:
        print('Lines which failed to parse:')
        for bad_line, reason in failures:
            print(' Following line failed: {}'.format(reason))
            print(bad_line)

48 changes: 48 additions & 0 deletions mlperf_logging/compliance_checker/training_4.1.0/closed_bert.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
POST: >
s['global_batch_size'] = v['value']
- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE

- KEY:
NAME: opt_lamb_epsilon
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_training_steps
REQ: EXACTLY_ONE

- KEY:
NAME: opt_learning_rate_warmup_steps
REQ: EXACTLY_ONE

- KEY:
NAME: num_warmup_steps
REQ: EXACTLY_ONE

- KEY:
NAME: start_warmup_step
REQ: EXACTLY_ONE

- KEY:
NAME: opt_lamb_beta_1
REQ: EXACTLY_ONE

- KEY:
NAME: opt_lamb_beta_2
REQ: EXACTLY_ONE

- KEY:
NAME: opt_lamb_weight_decay_rate
REQ: EXACTLY_ONE

- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@

- KEY:
NAME: submission_benchmark
REQ: EXACTLY_ONE
CHECK: " v['value'] in ['resnet', 'ssd', 'stable_diffusion', 'maskrcnn', 'gpt3', 'dlrm_dcnv2', 'bert', 'rnnt', 'unet3d', 'gnn','llama2_70b_lora'] "
POST: " enqueue_config('training_4.1.0/closed_{}.yaml'.format(v['value'])) "

- KEY:
NAME: gradient_accumulation_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0 "
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE

- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'adagrad' "

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE

- KEY:
NAME: opt_adagrad_learning_rate_decay
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0 "

- KEY:
NAME: opt_weight_decay
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0 "

- KEY:
NAME: opt_adagrad_initial_accumulator_value
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0 "

- KEY:
NAME: opt_adagrad_epsilon
REQ: EXACTLY_ONE
CHECK: " v['value'] == 1e-8 "

- KEY:
NAME: opt_learning_rate_warmup_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0 "

- KEY:
NAME: opt_learning_rate_decay_start_step
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0 "

- KEY:
NAME: opt_learning_rate_decay_steps
REQ: EXACTLY_ONE
CHECK: " v['value'] == 0 "

- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0"

- KEY:
NAME: eval_samples
REQ: EXACTLY_ONE
CHECK: " v['value'] == 89137319 "
21 changes: 21 additions & 0 deletions mlperf_logging/compliance_checker/training_4.1.0/closed_gnn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
- KEY:
NAME: global_batch_size
REQ: EXACTLY_ONE
CHECK: " v['value'] > 0"

- KEY:
NAME: opt_name
REQ: EXACTLY_ONE
CHECK: " v['value'] == 'adam' "

- KEY:
NAME: opt_base_learning_rate
REQ: EXACTLY_ONE
CHECK: " v['value'] >= 0.0"

- KEY:
NAME: eval_accuracy
REQ: AT_LEAST_ONE
CHECK:
- "'epoch_num' in v['metadata']"
ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0"
Loading

0 comments on commit 9b5f1b4

Please sign in to comment.