This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

Add INT4 ONNX whisper example (#450)
* Update README.md
mengniwang95 authored Oct 12, 2023
1 parent e9fc4c2 commit c7f8173
Showing 4 changed files with 35 additions and 12 deletions.
README.md
@@ -26,14 +26,24 @@ bash run_tuning.sh --config=openai/whisper-large \
--approach=static # or dynamic
```

- To get the int4 model (`--dataset_location` is optional)

```
bash run_tuning.sh --config=openai/whisper-large \
--dataset_location=/path/to/dataset \
--input_model=whisper-large-with-past/ \
--output_model=whisper-large-onnx-int4/ \
--approach=weight_only
```
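For orientation, here is a minimal sketch of what the weight-only path does with the neural-compressor API, mirroring the quantization calls in the Python diff further down; the ONNX file names are illustrative:

```
from neural_compressor import PostTrainingQuantConfig, quantization

# Weight-only (INT4) post-training quantization via round-to-nearest (RTN),
# asymmetric scheme, applied to the weights of every op type
conf = PostTrainingQuantConfig(
    approach="weight_only",
    op_type_dict={'.*': {'weight': {'algorithm': ['RTN'], 'scheme': ['asym']}}},
)
q_model = quantization.fit("whisper-large-with-past/encoder_model.onnx", conf=conf)
q_model.save("whisper-large-onnx-int4/encoder_model.onnx")
```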

## 2. Benchmark
- To get model accuracy (`--dataset_location` is optional; pass `--int4` instead of `--int8` for the INT4 model)

```
bash run_benchmark.sh --config=whisper-large-with-past \
--dataset_location=/path/to/dataset \
--input_model=whisper-large-with-past-static/ \
--int8 \
--mode=accuracy
```
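Accuracy for Whisper examples is typically word error rate (WER); the `jiwer` and `evaluate` entries in the requirements point the same way. A minimal illustration with made-up strings:

```
import evaluate

wer = evaluate.load("wer")  # backed by jiwer
score = wer.compute(predictions=["the quick brown fox"],
                    references=["the quick brown fox jumps"])
print(score)  # 0.2 -> one error against five reference words
```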

@@ -46,7 +56,7 @@ numactl -m 0 -C 0-3 bash run_benchmark.sh --config=whisper-large-with-past \
--mode=benchmark \
--iters=100 \
--cores_per_instance=4 \
--int8 \
--max_new_tokens=16
```
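Here `numactl -m 0` binds memory allocation to NUMA node 0 and `-C 0-3` pins execution to cores 0-3; pass `--int4` in place of `--int8` to benchmark the INT4 model. A hypothetical two-instance variant, assuming node 0 has at least eight physical cores:

```
numactl -m 0 -C 0-3 bash run_benchmark.sh --config=whisper-large-with-past \
                    --input_model=whisper-large-with-past-static/ \
                    --mode=benchmark --iters=100 --cores_per_instance=4 \
                    --int8 --max_new_tokens=16 &
numactl -m 0 -C 4-7 bash run_benchmark.sh --config=whisper-large-with-past \
                    --input_model=whisper-large-with-past-static/ \
                    --mode=benchmark --iters=100 --cores_per_instance=4 \
                    --int8 --max_new_tokens=16 &
wait
```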

@@ -86,8 +96,11 @@ Available INT4 models on huggingface:

# Validated model list

-|Topology|Pretrained model|PostTrainingDynamic|PostTrainingStatic|
-|---|---|---|---|
-|whisper_large|openai/whisper-large| | |
+|Topology|Pretrained model|PostTrainingDynamic|PostTrainingStatic|WeightOnly4Bit|
+|---|---|---|---|---|
+|whisper_tiny|openai/whisper-tiny| | | |
+|whisper_base|openai/whisper-base| | | |
+|whisper_small|openai/whisper-small| | | |
+|whisper_medium|openai/whisper-medium| | | |
+|whisper_large|openai/whisper-large| | | |
+|whisper_large_v2|openai/whisper-large-v2| | | |
requirements.txt
@@ -4,7 +4,7 @@ transformers
jiwer
optimum
onnx
-onnxruntime
+onnxruntime==1.16.0
evaluate
neural-compressor
librosa
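To reproduce the environment (assuming this file is the example's requirements.txt, as its contents suggest):

```
pip install -r requirements.txt
```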
run_benchmark.sh
@@ -40,6 +40,9 @@ function init_params {
      --int8=*)
          int8=$(echo ${var} |cut -f2 -d=)
      ;;
+     --int4=*)
+         int4=$(echo ${var} |cut -f2 -d=)
+     ;;
    esac
done
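The `cut -f2 -d=` idiom splits each argument on `=` and keeps the value, so a flag has to be passed in `--int4=true` form to match these case patterns. A standalone illustration:

```
var="--int4=true"
int4=$(echo ${var} |cut -f2 -d=)
echo ${int4}   # prints: true
```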

@@ -49,7 +52,7 @@ function init_params {
# run_benchmark
function run_benchmark {

-    if [[ ${int8} == "false" ]]; then
+    if [[ ${int8} == "false" && ${int4} == "false" ]]; then  # no quantized model requested: benchmark the FP32 model from --config
        input_model=${config}
    fi

Expand Down
Python quantization/benchmark script
@@ -30,7 +30,7 @@
parser.add_argument('--input_model', default=None, type=str,
                    help='the folder path to fp32 models')
parser.add_argument('--approach', default='dynamic', type=str,
-                   help='the quantization approach to use')
+                   help='the quantization approach to use; supports static, dynamic and weight_only')
parser.add_argument('--model_name_or_path', default=None, type=str)
parser.add_argument('--cores_per_instance', default=4, type=int,
                    help='cores per instance during benchmark')
@@ -174,13 +174,20 @@ def audiosegment_to_librosawav(audiosegment):
                                       calib_dataloader=dataloader
                                       )
            q_model.save(os.path.join(args.output_model, model))
-    else:
+    elif args.approach == 'dynamic':
        conf = PostTrainingQuantConfig(approach="dynamic",
                                       op_type_dict={'^((?!(MatMul|Gather|Conv)).)*$': {'weight': {'dtype': ['fp32']}, 'activation': {'dtype': ['fp32']}}},)
        for model in model_list:
            q_model = quantization.fit(os.path.join(args.input_model, model),
                                       conf=conf)
            q_model.save(os.path.join(args.output_model, model))
+    else:
+        # weight-only INT4: round-to-nearest (RTN) with an asymmetric scheme on
+        # the weights of every op type; no calibration data is needed
+        conf = PostTrainingQuantConfig(approach="weight_only",
+                                       op_type_dict={'.*': {'weight': {'algorithm': ['RTN'], 'scheme': ['asym']}}},)
+        for model in model_list:
+            q_model = quantization.fit(os.path.join(args.input_model, model),
+                                       conf=conf)
+            q_model.save(os.path.join(args.output_model, model))

if args.accuracy_only:
eval(args.input_model)
@@ -198,7 +205,7 @@ def audiosegment_to_librosawav(audiosegment):
session_options=sess_options)
model = ORTModelForSpeechSeq2Seq(sessions[0], sessions[1], config, args.input_model, sessions[2])
processor = WhisperProcessor.from_pretrained(args.model_name_or_path)

if args.audio_test:
from pydub import AudioSegment
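For context, a hypothetical end-to-end use of the `model`/`processor` pair loaded above; the one-second silent clip at 16 kHz is purely illustrative:

```
import numpy as np

audio = np.zeros(16000, dtype=np.float32)  # one second of 16 kHz silence
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
predicted_ids = model.generate(inputs["input_features"])
text = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(text)
```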

