From f764b11f661d42a637b4dee348efc2489fb92fe2 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 13 Sep 2024 00:35:02 -0700
Subject: [PATCH 1/4] fix quant test

---
 tests/quantization/test_bitsandbytes.py | 26 +++++++++++++-------------
 tests/quantization/utils.py             |  4 +---
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 3f0c6cbc051a7..da98471db0419 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -77,19 +77,6 @@ def validate_generated_texts(hf_runner,
                              model_name,
                              hf_model_kwargs=None):
 
-    if hf_model_kwargs is None:
-        hf_model_kwargs = {}
-
-    # Run with HF runner
-    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
-        hf_outputs = llm.generate_greedy(prompts, 8)
-        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
-
-    # Clean up the GPU memory for the next test
-    torch.cuda.synchronize()
-    gc.collect()
-    torch.cuda.empty_cache()
-
     #Run with vLLM runner
     with vllm_runner(model_name,
                      quantization='bitsandbytes',
@@ -104,6 +91,19 @@ def validate_generated_texts(hf_runner,
     gc.collect()
     torch.cuda.empty_cache()
 
+    if hf_model_kwargs is None:
+        hf_model_kwargs = {}
+
+    # Run with HF runner
+    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
+        hf_outputs = llm.generate_greedy(prompts, 8)
+        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
+
+    # Clean up the GPU memory for the next test
+    torch.cuda.synchronize()
+    gc.collect()
+    torch.cuda.empty_cache()
+
     # Compare the generated strings
     for hf_log, vllm_log in zip(hf_logs, vllm_logs):
         hf_str = hf_log["generated_text"]
diff --git a/tests/quantization/utils.py b/tests/quantization/utils.py
index 65bb80ed70c6a..5fad06878f4a3 100644
--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -1,12 +1,10 @@
-import torch
-
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.platforms import current_platform
 
 
 def is_quant_method_supported(quant_method: str) -> bool:
     # Currently, all quantization methods require Nvidia or AMD GPUs
-    if not torch.cuda.is_available():
+    if not (current_platform.is_cuda() or current_platform.is_rocm()):
         return False
 
     capability = current_platform.get_device_capability()

From 24ab0657c69a1280d5a6c02732af643cbd97347f Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 13 Sep 2024 00:38:18 -0700
Subject: [PATCH 2/4] add comments

---
 tests/quantization/test_bitsandbytes.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index da98471db0419..5b918cd9a05b0 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -77,6 +77,9 @@ def validate_generated_texts(hf_runner,
                              model_name,
                              hf_model_kwargs=None):
 
+    # NOTE: run vLLM first, as it requires a clean process
+    # when using distributed inference
+
     #Run with vLLM runner
     with vllm_runner(model_name,
                      quantization='bitsandbytes',

From 2784bbd7744e04857d02c8d5c4858d71abeccba2 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 13 Sep 2024 00:39:07 -0700
Subject: [PATCH 3/4] add fork

---
 tests/quantization/test_bitsandbytes.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 5b918cd9a05b0..4872d71c89c6f 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -9,6 +9,7 @@
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
+from ..utils import fork_new_process_for_each_test
 
 models_4bit_to_test = [
     ('huggyllama/llama-7b', 'quantize model inflight'),
@@ -29,6 +30,7 @@
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
+@fork_new_process_for_each_test
 def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
 
@@ -41,6 +43,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                          models_pre_qaunt_4bit_to_test)
+@fork_new_process_for_each_test
 def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                        model_name, description) -> None:
 
@@ -52,6 +55,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                     reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                          models_pre_quant_8bit_to_test)
+@fork_new_process_for_each_test
 def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                              model_name, description) -> None:
 

From 9d4361678f85db0d57e8e360bc25bacc2b7365d8 Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Fri, 13 Sep 2024 00:39:17 -0700
Subject: [PATCH 4/4] format

---
 tests/quantization/test_bitsandbytes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 4872d71c89c6f..87200b1dcc534 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -9,6 +9,7 @@
 import torch
 
 from tests.quantization.utils import is_quant_method_supported
+
 from ..utils import fork_new_process_for_each_test
 
 models_4bit_to_test = [
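The series above decorates each bitsandbytes test with `fork_new_process_for_each_test`, imported from `tests/utils.py`, so every test body runs in its own forked child process and the vLLM engine always starts from a clean process, as the NOTE added in patch 2 explains. The helper's real implementation is not part of this diff; the sketch below is only an illustration of the general fork-and-wait pattern such a decorator can follow, with names and error handling that are assumptions rather than vLLM's actual code:

```python
import functools
import os
import traceback


def fork_new_process_for_each_test(f):
    """Illustrative sketch only (POSIX): run the wrapped test in a forked
    child process so that GPU/CUDA state never leaks between tests.

    This is NOT the helper from vllm's tests/utils.py; it merely shows the
    fork-run-wait pattern the decorator name suggests.
    """

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        pid = os.fork()
        if pid == 0:
            # Child: run the test body and exit immediately, skipping the
            # parent's cleanup and atexit handlers.
            try:
                f(*args, **kwargs)
            except BaseException:
                traceback.print_exc()
                os._exit(1)
            os._exit(0)
        # Parent: wait for the child and surface its failure to pytest.
        _, status = os.waitpid(pid, 0)
        assert os.WIFEXITED(status) and os.WEXITSTATUS(status) == 0, (
            f"test {f.__name__} failed in forked child process")

    return wrapper
```

Forking per test (patch 3) and running the vLLM runner before the HF runner (patch 1) serve the same goal stated in the NOTE: the vLLM engine, particularly with distributed inference, needs a process that has not already been polluted by earlier GPU work.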