Load model in the target export precision by default in PTQ (#10267)
* Load model in the target export precision by default

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

* Enable megatron_amp_O2=true to actually use half-precision

Signed-off-by: Jan Lasek <jlasek@nvidia.com>

---------

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>
Signed-off-by: Jan Lasek <jlasek@nvidia.com>
janekl authored and Shanmugam Ramasamy committed Aug 27, 2024
1 parent f1f145a commit 0d1e460
Showing 1 changed file with 4 additions and 2 deletions.

examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
@@ -17,13 +17,15 @@ trainer:
   num_nodes: 1
   accelerator: gpu
   logger: false # logger provided by exp_manager
-  precision: bf16 # 16, 32, or bf16
+  precision: ${export.dtype} # 16, bf16, or 32
   enable_checkpointing: false
 
 model:
   tensor_model_parallel_size: 1
   pipeline_model_parallel_size: 1
   restore_from_path: llama2-7b-fp16.nemo # Nemo file path
+  precision: ${export.dtype} # Model weights data type
+  megatron_amp_O2: true # Enable Megatron O2-style half-precision
 
   ## Activation Checkpoint
   activations_checkpoint_granularity: null # 'selective' or 'full'
@@ -42,7 +42,7 @@ export:
   decoder_type: llama # gptnext, gpt2, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
-  dtype: ${trainer.precision} # Default precision data type
+  dtype: 16 # Default precision data type for non-quantized layers: 16 or bf16
   save_path: llama2-7b-${quantization.algorithm}.qnemo # Path where the quantized model will be saved
   compress: false # Whether save_path should be a tarball or a directory
   sample_output: true # Whether to run a sample prompt before saving
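The substance of the change: trainer.precision and model.precision now interpolate export.dtype, so the export precision is the single source of truth and the model is loaded in the precision it will be exported in; previously the dependency ran the other way (dtype followed trainer.precision). Below is a minimal OmegaConf sketch of how this interpolation resolves — a standalone illustration of the pattern, not the NeMo entry point, with the config trimmed to the relevant keys:

from omegaconf import OmegaConf

# Reproduce the interpolation pattern from megatron_gpt_ptq.yaml.
cfg = OmegaConf.create(
    """
trainer:
  precision: ${export.dtype}   # follows the export precision
model:
  precision: ${export.dtype}   # weights loaded in the export dtype
  megatron_amp_O2: true        # O2-style half-precision wrapping
export:
  dtype: 16                    # the single knob: 16 or bf16
"""
)

print(cfg.trainer.precision)  # -> 16
print(cfg.model.precision)    # -> 16

With a Hydra-style override such as export.dtype=bf16 on the PTQ script's command line, the trainer and model precision follow automatically, which is what makes a single default here sufficient.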
