From 0acb46cc9782870687acc7d7ac26a538b6e72ba2 Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Thu, 4 Apr 2024 10:32:18 +0000
Subject: [PATCH 1/3] Disable weight compression on the optimum-intel path if
 the model is being converted on-the-fly (not from IR).

---
 vllm/model_executor/openvino_model_loader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/openvino_model_loader.py b/vllm/model_executor/openvino_model_loader.py
index be48bcdb2bca5..049da003ad183 100644
--- a/vllm/model_executor/openvino_model_loader.py
+++ b/vllm/model_executor/openvino_model_loader.py
@@ -603,6 +603,7 @@ def get_model(model_config: ModelConfig,
         model_config.model,
         export=export,
         compile=False,
+        load_in_8bit=False,
         trust_remote_code=model_config.trust_remote_code
     )
     patch_stateful_model(pt_model.model, kv_cache_dtype, device_config.device.type == "cpu")

From 0bb4a52001fde76ef0e076588f061c5b663a2ade Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Mon, 15 Apr 2024 14:09:55 +0000
Subject: [PATCH 2/3] Enable int8 weight compression via env var

---
 vllm/model_executor/openvino_model_loader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/openvino_model_loader.py b/vllm/model_executor/openvino_model_loader.py
index 049da003ad183..15081d584bbad 100644
--- a/vllm/model_executor/openvino_model_loader.py
+++ b/vllm/model_executor/openvino_model_loader.py
@@ -599,11 +599,12 @@ def get_model(model_config: ModelConfig,
     else:
         print(f'[ INFO ] OpenVINO IR is available for provided model id {model_config.model}. '
               'This IR will be used for inference as-is, all possible options that may affect model conversion are ignored.')
+    load_in_8bit = None if os.environ.get('VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS', '0') == '1' else False
     pt_model = OVModelForCausalLM.from_pretrained(
         model_config.model,
         export=export,
         compile=False,
-        load_in_8bit=False,
+        load_in_8bit=load_in_8bit,
         trust_remote_code=model_config.trust_remote_code
     )
     patch_stateful_model(pt_model.model, kv_cache_dtype, device_config.device.type == "cpu")

From 02a108a93e96d26e98b861c2b91df64d73bc3310 Mon Sep 17 00:00:00 2001
From: Sergey Lyalin
Date: Tue, 16 Apr 2024 08:58:30 +0000
Subject: [PATCH 3/3] Describe the weights compression option in the
 documentation

---
 use_with_openvino.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/use_with_openvino.md b/use_with_openvino.md
index 24113c1938859..697e23fa5201e 100644
--- a/use_with_openvino.md
+++ b/use_with_openvino.md
@@ -52,7 +52,7 @@ python3 benchmark_serving.py --backend openai --endpoint /v1/completions --port
 ```


-## Use vLLM offline
+## Use vLLM offline

 _All below steps assume you are in `vllm` root directory._

@@ -82,3 +82,11 @@ docker run --rm -it --entrypoint python3 -v $HOME/.cache/huggingface:/root/.cach
 # --num-prompts (default: 1000)
 # --swap-space (default: 50)
 ```
+
+## Use Int-8 Weights Compression
+
+Int-8 weights compression is disabled by default. For better performance and lower memory consumption, weights compression can be enabled by setting the environment variable `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1`.
+To pass the variable in Docker, add `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` to the `docker run` commands in the examples above.
+
+The variable enables the weights compression logic described in [optimum-intel 8-bit weights quantization](https://huggingface.co/docs/optimum/intel/optimization_ov#8-bit).
+Hence, even when the variable is enabled, compression is applied only to models above a certain size; models that are too small are left uncompressed, since compressing them would cause a significant accuracy drop.
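
Reviewer note (not part of the patch): below is a minimal standalone sketch of the tri-state `load_in_8bit` handling that the second commit introduces, assuming `optimum[openvino]` is installed. The model id is a placeholder; per the size heuristic mentioned above, optimum-intel may still skip compression for a model this small even when the variable is set.

```python
import os

from optimum.intel import OVModelForCausalLM

# Mirror the patched gating logic:
#   None  -> defer to optimum-intel's default behavior, which compresses
#            weights to int8 only for models above a certain size;
#   False -> explicitly disable weights compression.
load_in_8bit = (
    None
    if os.environ.get("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", "0") == "1"
    else False
)

model = OVModelForCausalLM.from_pretrained(
    "facebook/opt-125m",        # placeholder model id, converted on the fly
    export=True,                # no OpenVINO IR available, so export from PyTorch
    compile=False,
    load_in_8bit=load_in_8bit,
)
```

To exercise the `None` branch in vLLM itself, export `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` before launching, or pass `-e VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=1` to `docker run` as described in the documentation change above.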