From 0d1e460bc0516e76920aacda2664a0638e9d1577 Mon Sep 17 00:00:00 2001
From: Jan Lasek
Date: Tue, 27 Aug 2024 16:31:18 +0200
Subject: [PATCH] Load model in the target export precision by default in PTQ
 (#10267)

* Load model in the target export precision by default

Signed-off-by: Jan Lasek

* Enable megatron_amp_O2=true to actually use half-precision

Signed-off-by: Jan Lasek

---------

Signed-off-by: Jan Lasek
Signed-off-by: Jan Lasek
---
 examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
index f603ebb58eb7..62f0e452d3b5 100644
--- a/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
+++ b/examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
@@ -17,13 +17,15 @@ trainer:
   num_nodes: 1
   accelerator: gpu
   logger: false # logger provided by exp_manager
-  precision: bf16 # 16, 32, or bf16
+  precision: ${export.dtype} # 16, bf16, or 32
   enable_checkpointing: false
 
 model:
   tensor_model_parallel_size: 1
   pipeline_model_parallel_size: 1
   restore_from_path: llama2-7b-fp16.nemo # Nemo file path
+  precision: ${export.dtype} # Model weights data type
+  megatron_amp_O2: true # Enable Megatron O2-style half-precision
 
   ## Activation Checkpoint
   activations_checkpoint_granularity: null # 'selective' or 'full'
@@ -42,7 +44,7 @@ export:
   decoder_type: llama # gptnext, gpt2, llama
   inference_tensor_parallel: 1 # Default using 1 TP for inference
   inference_pipeline_parallel: 1 # Default using 1 PP for inference
-  dtype: ${trainer.precision} # Default precision data type
+  dtype: 16 # Default precision data type for non-quantized layers: 16 or bf16
   save_path: llama2-7b-${quantization.algorithm}.qnemo # Path where the quantized model will be saved
   compress: false # Whether save_path should be a tarball or a directory
   sample_output: true # Whether to run a sample prompt before saving
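
For reference, a minimal sketch of how the inverted interpolation plays out at launch time. This assumes the config is consumed by NeMo's Hydra-based PTQ example script; the script name, the .nemo path, and the fp8 algorithm value below are illustrative assumptions, not taken from the patch:

    # export.dtype now drives both trainer.precision and model.precision via
    # ${export.dtype}, so overriding it once switches the entire run to bf16:
    python examples/nlp/language_modeling/megatron_gpt_ptq.py \
        model.restore_from_path=llama2-7b-fp16.nemo \
        export.dtype=bf16 \
        quantization.algorithm=fp8

Previously, dtype: ${trainer.precision} made the export precision follow the trainer setting; after this change the export precision is the single knob, the model is loaded in that precision from the start, and megatron_amp_O2: true ensures the half-precision setting is actually applied to the weights.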