modifications to address PR changes
achew010 committed Aug 15, 2024
1 parent 94c499f commit 0fe0867
Showing 5 changed files with 59 additions and 30 deletions.
33 changes: 14 additions & 19 deletions scripts/benchmarks/benchmark.py
@@ -165,6 +165,7 @@ def __init__(
input_field: str = "input",
dataset_text_field: str = "output",
chat_template: str = None,
response_template: str = None,
additional_dataset_kwargs: Dict = {},
) -> None:

@@ -183,7 +184,7 @@ def __init__(
}
self.training_paths = {} # cache to store the training paths
self.data_save_path = data_save_path
self.is_pretokenized = False
self.response_template = response_template

def prepare_dataset(
self,
@@ -193,6 +194,14 @@ def prepare_dataset(
if model_name in self.training_paths:
return self.training_paths[model_name]

if self.response_template:
if response_template is not None:
warnings.warn(
"Response Template detected in data processing field, "
"overriding response template."
)
response_template = self.response_template

if self.kwargs["tokenize"]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

@@ -227,8 +236,6 @@ def prepare_dataset(

# call the map
ds = self.dataset_split.map(format_fn, **kwargs)
# set an attribute to indicate dataset has already been tokenized
self.is_pretokenized = 'input_ids' in ds.column_names and 'labels' in ds.column_names

# save it
ds.to_json(save_path)
@@ -266,8 +273,8 @@ def convert_keyvalue_arguments_to_list(args_dict: Dict):
# otherwise if a regular argument
if val is None:
warnings.warn(
f"Argument '{arg}' is not a true/false argument andhad a 'None' value "\
"and thus will be ignored.",
f"Argument '{arg}' is not a true/false argument and "
"had a 'None' value and thus will be ignored.",
)
continue

@@ -680,7 +687,7 @@ def prepare_arguments(args, benchmark_dataset: BenchmarkDataset):
# scenario-specific constants should overwrite any similar values in defaults
defaults = {k:v for k, v in defaults.items() if k not in scenario_constants}
# update defaults with scenario constants
constants = {**scenario_constants, **defaults}
constants = {**defaults, **scenario_constants}
# Remove any empty variables and combine matrices to dictionary to cartesian product on
combined_matrices = {**scenario_matrices, **experiment_matrices}
products = ConfigUtils.cartesian_product_on_dict(combined_matrices)
@@ -695,20 +702,8 @@ def prepare_arguments(args, benchmark_dataset: BenchmarkDataset):
# prepare the dataset
training_path = benchmark_dataset.prepare_dataset(
x["model_name_or_path"],
(
x[HF_ARG_RESPONSE_TEMPLATE]
if HF_ARG_RESPONSE_TEMPLATE in x
else constants.get(HF_ARG_RESPONSE_TEMPLATE)
),
constants.get(HF_ARG_RESPONSE_TEMPLATE),
)
# Check to remove all template arguments if dataset is pretokenized at this stage
if benchmark_dataset.is_pretokenized:
# if `prepare_dataset` has formatted and tokenized the dataset,
# Update the following args to ensure SFTTrainer
# recognizes it as a pretokenized dataset
constants[HF_ARG_RESPONSE_TEMPLATE] = None
constants[HF_ARG_DATASET_TEXT_FIELD] = None
x["packing"] = False

# update
x[HF_ARG_TRAINING_DATA_PATH] = training_path
14 changes: 12 additions & 2 deletions scripts/benchmarks/compare_with_reference.py
@@ -172,9 +172,19 @@ def main(
help="the acceptable relative difference from the reference value.",
)

parser.add_argument("--indices", default=DEFAULT_INDICES, nargs="+")
parser.add_argument(
"--indices",
default=DEFAULT_INDICES,
nargs="+",
help="list of column names to use as index for merging between old and new benchmark results",
)

parser.add_argument("--plot_columns", default=DEFAULT_PLOT_COLUMNS, nargs="+")
parser.add_argument(
"--plot_columns",
default=DEFAULT_PLOT_COLUMNS,
nargs="+"
help="list of metric names in benchmark results to analyze visually",
)

args = parser.parse_args()
main(
17 changes: 16 additions & 1 deletion scripts/benchmarks/scenarios-pretok.yaml
@@ -36,6 +36,7 @@ data_processing:
{%- endfor %}
dataset_split: "train[:2000]"
tokenize: True
response_template: "\n### Response:"

# scenarios
scenarios:
@@ -45,8 +46,12 @@ scenarios:
torch_dtype: float16
gradient_accumulation_steps: 2
max_steps: null
packing: False
model_name_or_path:
- 'mistralai/Mistral-7B-v0.1'
response_template: null
dataset_text_field: null


- name: padding-free
framework_config:
@@ -56,13 +61,16 @@
torch_dtype: float16
gradient_accumulation_steps: 2
max_steps: null
#remove_unused_columns: False # used only for refactor
packing: False
model_name_or_path:
- 'mistralai/Mistral-7B-v0.1'
response_template: null
dataset_text_field: null

- name: accelerated-peft-bnb
framework_config:
- accelerated-peft-bnb
- accelerated-peft-bnb-padding-free
- accelerated-peft-bnb-foak
- accelerated-peft-bnb-foak-padding-free
arguments:
@@ -76,12 +84,16 @@
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
max_steps: null
gradient_accumulation_steps: 2
packing: False
model_name_or_path:
- 'mistralai/Mistral-7B-v0.1'
response_template: null
dataset_text_field: null

- name: accelerated-peft-gptq
framework_config:
- accelerated-peft-autogptq
- accelerated-peft-autogptq-padding-free
- accelerated-peft-autogptq-foak
- accelerated-peft-autogptq-foak-padding-free
arguments:
@@ -95,5 +107,8 @@
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj"]
max_steps: null
gradient_accumulation_steps: 2
packing: False
model_name_or_path:
- 'TheBloke/Mistral-7B-v0.1-GPTQ'
response_template: null
dataset_text_field: null
23 changes: 16 additions & 7 deletions scripts/run_benchmarks.sh
@@ -36,7 +36,6 @@ PIP_REQUIREMENTS_FILE=requirements.txt
# ------------- DROP COLUMNS FROM RESULTS -----------------
# env inputs
DRY_RUN=${DRY_RUN:-"false"}
NO_COMPARE=${NO_COMPARE:-"false"}
NO_DATA_PROCESSING=${NO_DATA_PROCESSING:-"false"}
NO_OVERWRITE=${NO_OVERWRITE:-"false"}
MEMORY_LOGGING=${MEMORY_LOGGING:-"all"}
@@ -46,6 +45,7 @@ NUM_GPUS_MATRIX=${1-"1 2"}
RESULT_DIR=${2:-"benchmark_outputs"}
SCENARIOS_CONFIG=${3:-$SCENARIOS_CONFIG}
SCENARIOS_FILTER=${4-$SCNTAG_PEFT_AUTOGPTQ}
DEFAULTS_CONFIG=${5:-$DEFAULTS_CONFIG}

echo "NUM_GPUS_MATRIX: $NUM_GPUS_MATRIX"
echo "RESULT_DIR: $RESULT_DIR"
@@ -139,9 +139,18 @@ PYTHONPATH=. \
'error_messages' \
'acceleration_framework_config_file'

if [ "$DRY_RUN" = "true" ]; then
echo "DRY_RUN=True, will skip compare with reference logic"
elif [ "$NO_COMPARE" = "false" ]; then
PYTHONPATH=. \
python $WORKING_DIR/compare_with_reference.py --result_dir $RESULT_DIR
fi

# For every new benchmark run, it is good practice to perform a regression check
# against a previous known set of benchmark results. This repo provides a convenient comparison
# tool that analyses the differences of metrics like loss and throughput between an old and new set
# of benchmark results.
# To use this tool simply run the following python command
# PYTHONPATH=. \
# python $WORKING_DIR/compare_with_reference.py
# The following arguments can be used to further configure the analysis, otherwise it uses default values
# arguments:
# --result_dir <Output directory to save comparison artifacts>
# --reference_benchmark_filepath <filepath of the old benchmark results to compare against>
# --threshold_ratio <to define an acceptable difference between old and new results>
# --indices <defines the set of column names used as unique identifier to merge the 2 sets of results>
# --plot_columns <specifies the metric names to be compared and visualized>
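As a concrete sketch of the command described above (the reference filepath, threshold, index columns, and metric names are illustrative placeholders; omit --indices and --plot_columns to keep the script's defaults):

    # illustrative values only, not repository defaults
    PYTHONPATH=. \
    python scripts/benchmarks/compare_with_reference.py \
        --result_dir benchmark_outputs \
        --reference_benchmark_filepath path/to/previous_benchmark_results.csv \
        --threshold_ratio 0.1 \
        --indices model_name_or_path num_gpus \
        --plot_columns train_loss train_tokens_per_second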
2 changes: 1 addition & 1 deletion tox.ini
@@ -41,7 +41,7 @@ commands =
python -m fms_acceleration.cli install -e {toxinidir}/plugins/attention_and_distributed_packing

# run the benchmark script
bash scripts/run_benchmarks.sh {posargs:"1 2" benchmark_outputs "scenarios-pretok.yaml" "none"}
bash scripts/run_benchmarks.sh {posargs:"1 2" benchmark_outputs}

allowlist_externals = bash
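With the trimmed posargs, the scenarios config and filter now come from the run_benchmarks.sh defaults, but they can still be supplied as positional arguments through tox. A minimal sketch, assuming the benchmark environment is named run-benches (check tox.ini for the actual name):

    # env name is assumed; posargs map onto run_benchmarks.sh positional arguments
    tox -e run-benches -- "1 2" benchmark_outputs scenarios-pretok.yaml none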
