Fix several bugs & add Bert inc example to pipeline (#1492)

## Describe your changes Fix Olive bugs & Add Bert inc examples to example pipeline. - `batch_size` is needed for Inc dataloader. Set default size to 1. - Custom eval func doesn't have batch_size as input. - For `QuantizationAwareTraining` pass, `train_data_config` is not required if user provides `training_loop_func`. - Latest transformers package will automatically save trained model as safetensors format. Add `save_safetensors` as false to train argument. - Some passes may have nested data_config in its config. Update auto-fill data_config logic to achieve this. ## Checklist before requesting a review - [ ] Add unit tests for this change. - [ ] Make sure all tests can pass. - [ ] Update documents if necessary. - [ ] Lint and apply fixes to your code by running `lintrunner -a` - [ ] Is this a user-facing change? If yes, give a description of this change to be included in the release notes. - [ ] Is this PR including examples changes? If yes, please remember to update [example documentation](https://github.com/microsoft/Olive/blob/main/docs/source/examples.md) in a follow-up PR. ## (Optional) Issue link
microsoft · Nov 18, 2024 · f47265f · f47265f
1 parent 40dc359
commit f47265f
Show file tree

Hide file tree

Showing 7 changed files with 63 additions and 9 deletions.
diff --git a/.azure_pipelines/olive-examples.yaml b/.azure_pipelines/olive-examples.yaml
@@ -25,6 +25,9 @@ jobs:
       bert_ptq_cpu_docker:
         exampleFolder: bert
         exampleName: bert_ptq_cpu_docker
+      bert_inc:
+        exampleFolder: bert
+        exampleName: bert_inc
       resnet_vitis_ai_ptq_cpu:
         exampleFolder: resnet
         exampleName: resnet_vitis_ai_ptq_cpu
@@ -56,6 +59,9 @@ jobs:
     onnxruntime: onnxruntime
     python_version: '3.10'
     examples:
+      bert_inc:
+        exampleFolder: bert
+        exampleName: bert_inc
       bert_ptq_cpu_docker:
         exampleFolder: bert
         exampleName: bert_ptq_cpu_docker

diff --git a/examples/bert/bert_inc_smoothquant_ptq_cpu.json b/examples/bert/bert_inc_smoothquant_ptq_cpu.json
@@ -49,7 +49,6 @@
         "quantization": {
             "type": "IncStaticQuantization",
             "quant_format": "QOperator",
-            "user_script": "user_script.py",
             "data_config": "inc_quat_data_config",
             "recipes": { "smooth_quant": true, "smooth_quant_args": { "alpha": 0.7 } },
             "metric": {
@@ -73,5 +72,5 @@
     },
     "evaluator": "common_evaluator",
     "cache_dir": "cache",
-    "output_dir": "models/bert_inc_static_ptq_cpu"
+    "output_dir": "models/bert_inc_smoothquant_ptq_cpu"
 }
diff --git a/examples/bert/requirements.txt b/examples/bert/requirements.txt
@@ -1,10 +1,12 @@
 azure-ai-ml
 azure-identity
-datasets
+# TODO(anyone): load_metrics was removed since 3.0.0. Using evaluate instead
+datasets<3.0.0
 docker>=7.1.0
 evaluate
 neural-compressor
 optimum
+pytorch_lightning
 scikit-learn
 scipy
 tabulate

diff --git a/examples/bert/user_script.py b/examples/bert/user_script.py
@@ -167,7 +167,7 @@ def bert_inc_glue_calibration_dataset(data_dir, **kwargs):
 
 
 @Registry.register_dataloader()
-def bert_inc_glue_calibration_dataloader(dataset, batch_size, **kwargs):
+def bert_inc_glue_calibration_dataloader(dataset, batch_size=1, **kwargs):
     return DefaultDataLoader(dataset=dataset, batch_size=batch_size)
 
 
@@ -176,7 +176,7 @@ def bert_inc_glue_calibration_dataloader(dataset, batch_size, **kwargs):
 # -------------------------------------------------------------------------
 
 
-def eval_accuracy(model: OliveModelHandler, device, execution_providers, batch_size, **kwargs):
+def eval_accuracy(model: OliveModelHandler, device, execution_providers, batch_size=1, **kwargs):
     dataset = bert_dataset("Intel/bert-base-uncased-mrpc")
     dataloader = bert_dataloader(dataset, batch_size)
     preds = []
@@ -240,6 +240,7 @@ def training_loop_func(model):
     training_args.save_strategy = "steps"
     training_args.save_total_limit = 1
     training_args.metric_for_best_model = "accuracy"
+    training_args.save_safetensors = False
 
     dataset = BertDataset("Intel/bert-base-uncased-mrpc")
 

diff --git a/examples/test/local/test_bert_inc.py b/examples/test/local/test_bert_inc.py
@@ -0,0 +1,35 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
+import json
+import os
+
+import pytest
+
+from ..utils import check_output, get_example_dir
+
+
+@pytest.fixture(scope="module", autouse=True)
+def setup():
+    """Setups any state specific to the execution of the given module."""
+    os.chdir(get_example_dir("bert"))
+
+
+@pytest.mark.parametrize(
+    "olive_json",
+    [
+        "bert_inc_dynamic_ptq_cpu.json",
+        "bert_inc_ptq_cpu.json",
+        "bert_inc_smoothquant_ptq_cpu.json",
+        "bert_inc_static_ptq_cpu.json",
+    ],
+)
+def test_bert(olive_json):
+    from olive.workflows import run as olive_run
+
+    with open(olive_json) as f:
+        olive_config = json.load(f)
+
+    footprint = olive_run(olive_config, tempdir=os.environ.get("OLIVE_TEMPDIR", None))
+    check_output(footprint)
diff --git a/olive/passes/pytorch/quantization_aware_training.py b/olive/passes/pytorch/quantization_aware_training.py
@@ -30,7 +30,6 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> Dict[str, PassCon
             **get_user_script_data_config(),
             "train_data_config": PassConfigParam(
                 type_=Union[DataConfig, Dict],
-                required=True,
                 description="Data config for training.",
             ),
             "val_data_config": PassConfigParam(
@@ -102,7 +101,8 @@ def _run_for_config(
         if Path(output_model_path).suffix != ".pt":
             output_model_path += ".pt"
 
-        qat_trainer_config.train_data_config = validate_config(config["train_data_config"], DataConfig)
+        if config["train_data_config"]:
+            qat_trainer_config.train_data_config = validate_config(config["train_data_config"], DataConfig)
         if config["val_data_config"]:
             qat_trainer_config.val_data_config = validate_config(config["val_data_config"], DataConfig)
         if config["training_loop_func"]:

diff --git a/olive/workflows/run/config.py b/olive/workflows/run/config.py
@@ -256,8 +256,8 @@ def validate_pass_search(cls, v, values):
         for param_name in v["config"]:
             if v["config"][param_name] == PassParamDefault.SEARCHABLE_VALUES:
                 searchable_configs.add(param_name)
-            if param_name.endswith("data_config"):
-                v["config"] = _resolve_data_config(v["config"], values, param_name)
+
+        resolve_all_data_configs(v["config"], values)
 
         if not values["engine"].search_strategy and searchable_configs:
             raise ValueError(
@@ -274,6 +274,17 @@ def validate_workflow_host(cls, v, values):
         return _resolve_config(values, v)
 
 
+def resolve_all_data_configs(config, values):
+    """Recursively traverse the config dictionary to resolve all 'data_config' keys."""
+    for param_name, param_value in config.items():
+        if param_name.endswith("data_config"):
+            _resolve_data_config(config, values, param_name)
+            continue
+
+        if isinstance(param_value, dict):
+            resolve_all_data_configs(param_value, values)
+
+
 def _insert_azureml_client(config, azureml_client):
     """Insert azureml_client into config recursively.