Add support for compliance for 4.1 (#382)

Some changes slipped through and were not included in #381. This PR fixes that.
mlcommons · Sep 10, 2024 · 9b5f1b4 · 9b5f1b4
1 parent 5190494
commit 9b5f1b4
Show file tree

Hide file tree

Showing 26 changed files with 828 additions and 30 deletions.
diff --git a/mlperf_logging/benchmark_meta.py b/mlperf_logging/benchmark_meta.py
@@ -122,7 +122,16 @@
         'llama2_70b_lora',
         'stable_diffusion',
         'gnn'
-    ]
+    ],
+    '4.1': [
+        'bert',
+        'dlrm_dcnv2',
+        'gpt3',        
+        'ssd',        
+        'stable_diffusion',
+        'llama2_70b_lora',
+        'gnn'
+    ]    
     },
 
     'hpc': {

diff --git a/mlperf_logging/compliance_checker/README.md b/mlperf_logging/compliance_checker/README.md
@@ -10,9 +10,9 @@ To check a log file for compliance:
 
     python -m mlperf_logging.compliance_checker [--config YAML] [--usage training/hpc] [--ruleset MLPERF_EDITION] FILENAME
 
-By default, 3.1.0 training edition rules are used and the default config is set to `3.1.0/common.yaml`.
+By default, 4.1.0 training edition rules are used and the default config is set to `4.1.0/common.yaml`.
 This config will check all common keys and enqueue benchmark specific config to be checked as well.
-Old training editions, still supported are 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
+Old training editions, still supported are 4.0.0, 3.1.0, 3.0.0, 2.1.0, 2.0.0, 1.1.0, 1.0.0, 0.7.0 and 0.6.0
 
 To check hpc compliance rules (only 1.0.0 ruleset is supported), set --usage hpc --ruleset 1.0.0.
 
@@ -22,27 +22,23 @@ As log examples use [NVIDIA's training logs](https://github.com/mlperf/training_
 
 ### Existing config files for training submissions
 
-    3.1.0/common.yaml          - currently the default config file, checks common fields complience and equeues benchmark-specific config file
-    3.1.0/closed_common.yaml   - the common rules file for closed submissions. These rules apply to all benchmarks
-    3.1.0/open_common.yaml     - the common rules file for open submissions. These rules apply to all benchmarks
-    3.1.0/closed_resnet.yaml   - Per-benchmark rules, closed submissions.
-    3.1.0/closed_ssd.yaml
-    3.1.0/closed_maskrcnn.yaml
-    3.1.0/closed_rnnt.yaml
-    3.1.0/closed_unet3d.yaml
-    3.1.0/closed_bert.yaml
-    3.1.0/closed_dlrm_dcnv2.yaml
-    3.1.0/closed_gpt3.yaml
-    3.1.0/closed_stable_diffusion.yaml
-    3.1.0/open_resnet.yaml   - Per-benchmark rules, closed submissions.
-    3.1.0/open_ssd.yaml
-    3.1.0/open_maskrcnn.yaml
-    3.1.0/open_rnnt.yaml
-    3.1.0/open_unet3d.yaml
-    3.1.0/open_bert.yaml
-    3.1.0/open_dlrm_dcnv2.yaml
-    3.1.0/open_gpt3.yaml
-    3.1.0/open_stable_diffusion.yaml
+    4.1.0/common.yaml          - currently the default config file, checks common fields compliance and enqueues benchmark-specific config file
+    4.1.0/closed_common.yaml   - the common rules file for closed submissions. These rules apply to all benchmarks
+    4.1.0/open_common.yaml     - the common rules file for open submissions. These rules apply to all benchmarks
+    4.1.0/closed_ssd.yaml   - Per-benchmark rules, closed submissions.    
+    4.1.0/closed_bert.yaml
+    4.1.0/closed_dlrm_dcnv2.yaml
+    4.1.0/closed_gpt3.yaml
+    4.1.0/closed_gnn.yaml
+    4.1.0/closed_llama2_70b_lora.yaml
+    4.1.0/closed_stable_diffusion.yaml
+    4.1.0/open_ssd.yaml   - Per-benchmark rules, open submissions.
+    4.1.0/open_bert.yaml
+    4.1.0/open_dlrm_dcnv2.yaml
+    4.1.0/open_gpt3.yaml
+    4.1.0/open_gnn.yaml
+    4.1.0/open_llama2_70b_lora.yaml
+    4.1.0/open_stable_diffusion.yaml
 
 ### Existing config files for HPC submissions
 

diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py b/mlperf_logging/compliance_checker/mlp_parser/__init__.py
@@ -7,6 +7,7 @@
 from .ruleset_300 import parse_file as parse_file_300
 from .ruleset_310 import parse_file as parse_file_310
 from .ruleset_400 import parse_file as parse_file_400
+from .ruleset_410 import parse_file as parse_file_410
 
 
 def parse_file(filename, ruleset='0.6.0'):
@@ -28,5 +29,7 @@ def parse_file(filename, ruleset='0.6.0'):
         return parse_file_310(filename)
     elif ruleset == '4.0.0':
         return parse_file_400(filename)    
+    elif ruleset == '4.1.0':
+        return parse_file_410(filename)
     else:
-        raise Exception(f'Ruleset "{ruleset}" is not supported')
+        raise Exception(f'Ruleset "{ruleset}" is not supported')
diff --git a/mlperf_logging/compliance_checker/mlp_parser/ruleset_410.py b/mlperf_logging/compliance_checker/mlp_parser/ruleset_410.py
@@ -0,0 +1,105 @@
+'''
+Parses a text MLPerf log into a structured format.
+'''
+
+from __future__ import print_function
+
+import collections
+import json
+import re
+import sys
+from dataclasses import dataclass
+
+from io import open
+
+@dataclass
+class LogLine:
+    """One parsed MLLOG entry, as built by string_to_logline()."""
+    # The log line text the entry was parsed from.
+    full_string: str
+    # The entry's 'time_ms' field converted to float.
+    timestamp: float
+    # The entry's 'key' field.
+    key: str
+    # NOTE(review): annotated as str, but string_to_logline() stores a dict
+    # of the form {'value': ..., 'metadata': ...} here.
+    value: str
+    # Index supplied by the caller; parse_generator() passes the 0-based position
+    # among the MLLOG lines, not the line number in the original file.
+    lineno: int
+
+TOKEN = ':::MLLOG '
+
+
+def parse_line(line):
+    if not line.startswith(TOKEN):
+        return None
+
+    return json.loads(line[len(TOKEN):])
+
+
+def string_to_logline(lineno, string):
+    ''' Returns a LogLine or raises a ValueError '''
+    m = parse_line(string)
+
+    if m is None:
+        raise ValueError('does not match regex')
+
+    args = []
+    args.append(string) # full string
+
+    ts = float(m['time_ms']) # may raise error, e.g. "1.2.3"
+    # TODO check for weird values
+    args.append(ts)
+
+    args.append(m['key']) # key
+
+    j = { 'value': m['value'], 'metadata': m['metadata'] }
+    args.append(j)
+
+    args.append(lineno)
+    return LogLine(*args)
+
+
+def parse_file(filename):
+    ''' Reads a file by name and returns list of loglines and list of errors'''
+    with open(filename, encoding='latin-1') as f:
+        return parse_generator(f)
+
+
+def strip_and_dedup(gen):
+    lines = []
+    for l in gen:
+        if TOKEN not in l:
+            continue
+        lines.append(re.sub(".*"+TOKEN, TOKEN, l))
+    return lines
+
+
+
+def parse_generator(gen):
+    ''' Reads a generator of lines and returns (loglines, errors)
+    The list of errors are any parsing issues as a tuple (str_line, error_msg)
+    '''
+    loglines = []
+    failed = []
+    for lineno, line in enumerate(strip_and_dedup(gen)):
+        line = line.strip()
+        try:
+            ll = string_to_logline(lineno, line)
+            loglines.append(ll)
+        except ValueError as e:
+            failed.append((line, str(e)))
+    return loglines, failed
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 2:
+        print('usage: mlp_parser.py FILENAME')
+        print('       tests parsing on the file.')
+        sys.exit(1)
+
+    filename = sys.argv[1]
+    lines, errors = parse_file(filename)
+
+    print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors)))
+
+    if len(errors) > 0:
+        print('Lines which failed to parse:')
+        for line, error in errors:
+            print('  Following line failed: {}'.format(error))
+            print(line)
+
diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_bert.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_bert.yaml
@@ -0,0 +1,48 @@
+# Closed-division rules for the bert benchmark (training_4.1.0/closed_bert.yaml).
+# Every key below except eval_accuracy is declared with REQ: EXACTLY_ONE.
+- KEY:
+    NAME:  global_batch_size
+    REQ:   EXACTLY_ONE
+    # Saves the logged value as s['global_batch_size'] (presumably state shared with
+    # later rules -- confirm against the checker).
+    POST: >
+        s['global_batch_size'] = v['value']
+
+# Learning-rate and LAMB optimizer hyperparameters; no value constraints are declared here.
+- KEY:
+    NAME:  opt_base_learning_rate
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_lamb_epsilon
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_learning_rate_training_steps
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_learning_rate_warmup_steps
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  num_warmup_steps
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  start_warmup_step
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_lamb_beta_1
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_lamb_beta_2
+    REQ:   EXACTLY_ONE
+
+- KEY:
+    NAME:  opt_lamb_weight_decay_rate
+    REQ:   EXACTLY_ONE
+
+# CHECK requires 'epoch_num' in the entry's metadata; the ATLEAST_ONE_CHECK
+# expression holds for a value in [0.720, 1.0).
+- KEY:
+    NAME:  eval_accuracy
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "(v['value'] >= 0.720) and v['value'] < 1.0"
diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_common.yaml
@@ -0,0 +1,11 @@
+
+# Rules applied to every closed-division submission of the 4.1.0 training edition.
+- KEY:
+    NAME:  submission_benchmark
+    REQ:   EXACTLY_ONE
+    # Accept only the benchmarks of the 4.1 round, so that the per-benchmark config
+    # enqueued by POST always names a rules file that exists for this edition.
+    CHECK: " v['value'] in ['bert', 'dlrm_dcnv2', 'gpt3', 'ssd', 'stable_diffusion', 'llama2_70b_lora', 'gnn'] "
+    POST:  " enqueue_config('training_4.1.0/closed_{}.yaml'.format(v['value'])) "
+
+# gradient_accumulation_steps must be logged exactly once with a positive value.
+- KEY:
+    NAME: gradient_accumulation_steps
+    REQ: EXACTLY_ONE
+    CHECK: " v['value'] > 0 "
diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_dlrm_dcnv2.yaml
@@ -0,0 +1,59 @@
+# Closed-division rules for the dlrm_dcnv2 benchmark (training_4.1.0/closed_dlrm_dcnv2.yaml).
+- KEY:
+    NAME:  global_batch_size
+    REQ:   EXACTLY_ONE
+
+# The optimizer name is pinned to 'adagrad'.
+- KEY:
+    NAME:  opt_name
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 'adagrad' "
+
+# The base learning rate must be logged, but no value constraint is declared here.
+- KEY:
+    NAME:  opt_base_learning_rate
+    REQ:   EXACTLY_ONE
+
+# The remaining optimizer and schedule settings are pinned to fixed values:
+# all must equal 0, except opt_adagrad_epsilon which must equal 1e-8.
+- KEY:
+    NAME:  opt_adagrad_learning_rate_decay
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  opt_weight_decay
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  opt_adagrad_initial_accumulator_value
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  opt_adagrad_epsilon
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 1e-8 "
+
+- KEY:
+    NAME:  opt_learning_rate_warmup_steps
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  opt_learning_rate_decay_start_step
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+- KEY:
+    NAME:  opt_learning_rate_decay_steps
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 0 "
+
+# CHECK requires 'epoch_num' in the entry's metadata; the ATLEAST_ONE_CHECK
+# expression holds for a value in [0.80275, 1.0] (upper bound inclusive).
+- KEY:
+    NAME:  eval_accuracy
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0"
+
+# The logged eval_samples value must equal 89137319.
+- KEY:
+    NAME:  eval_samples
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 89137319 "
diff --git a/mlperf_logging/compliance_checker/training_4.1.0/closed_gnn.yaml b/mlperf_logging/compliance_checker/training_4.1.0/closed_gnn.yaml
@@ -0,0 +1,21 @@
+# Closed-division rules for the gnn benchmark (training_4.1.0/closed_gnn.yaml).
+# The batch size must be strictly positive.
+- KEY:
+    NAME:  global_batch_size
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] > 0"
+
+# The optimizer name is pinned to 'adam'.
+- KEY:
+    NAME:  opt_name
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] == 'adam' "
+
+# The base learning rate must be non-negative (0.0 is accepted).
+- KEY:
+    NAME:  opt_base_learning_rate
+    REQ:   EXACTLY_ONE
+    CHECK: " v['value'] >= 0.0"
+
+# CHECK requires 'epoch_num' in the entry's metadata; the ATLEAST_ONE_CHECK
+# expression holds for a value in [0.72, 1.0).
+- KEY:
+    NAME:  eval_accuracy
+    REQ:   AT_LEAST_ONE
+    CHECK:
+        - "'epoch_num' in v['metadata']"
+    ATLEAST_ONE_CHECK: "v['value'] >= 0.72 and v['value'] < 1.0"