vllm-project · DarkLight1337 · Sep 13, 2024 · Sep 13, 2024 · Sep 13, 2024 · Sep 13, 2024
@@ -77,19 +77,6 @@ def validate_generated_texts(hf_runner,
                              model_name,
                              hf_model_kwargs=None):
 
-    if hf_model_kwargs is None:
-        hf_model_kwargs = {}
-
-    # Run with HF runner
-    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
-        hf_outputs = llm.generate_greedy(prompts, 8)
-        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
-
-    # Clean up the GPU memory for the next test
-    torch.cuda.synchronize()
-    gc.collect()
-    torch.cuda.empty_cache()
-
     #Run with vLLM runner
     with vllm_runner(model_name,
                      quantization='bitsandbytes',
@@ -104,6 +91,19 @@ def validate_generated_texts(hf_runner,
     gc.collect()
     torch.cuda.empty_cache()
 
+    if hf_model_kwargs is None:
+        hf_model_kwargs = {}
+
+    # Run with HF runner
+    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
+        hf_outputs = llm.generate_greedy(prompts, 8)
+        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")
+
+    # Clean up the GPU memory for the next test
+    torch.cuda.synchronize()
+    gc.collect()
+    torch.cuda.empty_cache()
+
     # Compare the generated strings
     for hf_log, vllm_log in zip(hf_logs, vllm_logs):
         hf_str = hf_log["generated_text"]

@@ -1,12 +1,10 @@
-import torch
-
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.platforms import current_platform
 
 
 def is_quant_method_supported(quant_method: str) -> bool:
     # Currently, all quantization methods require Nvidia or AMD GPUs
-    if not torch.cuda.is_available():
+    if not (current_platform.is_cuda() or current_platform.is_rocm()):
         return False
 
     capability = current_platform.get_device_capability()