From 0acb46cc9782870687acc7d7ac26a538b6e72ba2 Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Thu, 4 Apr 2024 10:32:18 +0000
Subject: [PATCH 1/3] Disable weight compression on the optimum-intel path if
 the model is being converted on-the-fly (not from IR).

---
 vllm/model_executor/openvino_model_loader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/openvino_model_loader.py b/vllm/model_executor/openvino_model_loader.py
index be48bcdb2bca5..049da003ad183 100644
--- a/vllm/model_executor/openvino_model_loader.py
+++ b/vllm/model_executor/openvino_model_loader.py
@@ -603,6 +603,7 @@ def get_model(model_config: ModelConfig,
         model_config.model,
         export=export,
         compile=False,
+        load_in_8bit=False,
         trust_remote_code=model_config.trust_remote_code
     )
     patch_stateful_model(pt_model.model, kv_cache_dtype, device_config.device.type == "cpu")

From 0bb4a52001fde76ef0e076588f061c5b663a2ade Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Mon, 15 Apr 2024 14:09:55 +0000
Subject: [PATCH 2/3] Enable int8 weight compression via env var

---
 vllm/model_executor/openvino_model_loader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/openvino_model_loader.py b/vllm/model_executor/openvino_model_loader.py
index 049da003ad183..15081d584bbad 100644
--- a/vllm/model_executor/openvino_model_loader.py
+++ b/vllm/model_executor/openvino_model_loader.py
@@ -599,11 +599,12 @@ def get_model(model_config: ModelConfig,
     else:
         print(f'[ INFO ] OpenVINO IR is available for provided model id {model_config.model}. '
               'This IR will be used for inference as-is, all possible options that may affect model conversion are ignored.')
+    load_in_8bit = None if os.environ.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '0') == '1' else False
     pt_model = OVModelForCausalLM.from_pretrained(
         model_config.model,
         export=export,
         compile=False,
-        load_in_8bit=False,
+        load_in_8bit=load_in_8bit,
         trust_remote_code=model_config.trust_remote_code
     )
     patch_stateful_model(pt_model.model, kv_cache_dtype, device_config.device.type == "cpu")

From 02a108a93e96d26e98b861c2b91df64d73bc3310 Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Tue, 16 Apr 2024 08:58:30 +0000
Subject: [PATCH 3/3] Describe the weights compression option in the
 documentation

---
 use_with_openvino.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/use_with_openvino.md b/use_with_openvino.md
index 24113c1938859..697e23fa5201e 100644
--- a/use_with_openvino.md
+++ b/use_with_openvino.md
@@ -52,7 +52,7 @@ python3 benchmark_serving.py --backend openai --endpoint /v1/completions --port
 ```


-## Use vLLM offline
+## Use vLLM offline

 _All below steps assume you are in `vllm` root directory._

@@ -82,3 +82,11 @@ docker run --rm -it --entrypoint python3 -v $HOME/.cache/huggingface:/root/.cach
 # --num-prompts (default: 1000)
 # --swap-space (default: 50)
 ```
+
+## Use Int-8 Weights Compression
+
+Int-8 weights compression is disabled by default. For better performance and lower memory consumption, weights compression can be enabled by setting the environment variable `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`.
+To pass the variable in Docker, add `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` to the `docker run` commands in the examples above.
+
+The variable enables the weights compression logic described in [optimum-intel 8-bit weights quantization](https://huggingface.co/docs/optimum/intel/optimization_ov#8-bit).
+Hence, even when the variable is enabled, compression is applied only to models above a certain size; models that are too small are left uncompressed, since compressing them would cause a significant accuracy drop.
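
Reviewer note (not part of the patch): below is a minimal standalone sketch of the tri-state `load_in_8bit` handling that the second commit introduces, assuming `optimum[openvino]` is installed. The model id is a placeholder; per the size heuristic mentioned above, optimum-intel may still skip compression for a model this small even when the variable is set.

```python
import os

from optimum.intel import OVModelForCausalLM

# Mirror the patched gating logic:
#   None  -> defer to optimum-intel's default behavior, which compresses
#            weights to int8 only for models above a certain size;
#   False -> explicitly disable weights compression.
load_in_8bit = (
    None
    if os.environ.get("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0") == "1"
    else False
)

model = OVModelForCausalLM.from_pretrained(
    "facebook/opt-125m",        # placeholder model id, converted on the fly
    export=True,                # no OpenVINO IR available, so export from PyTorch
    compile=False,
    load_in_8bit=load_in_8bit,
)
```

To exercise the `None` branch in vLLM itself, export `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` before launching, or pass `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` to `docker run` as described in the documentation change above.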