Commit 21d2337
add test case; fix imports for tests
dsikka committed Aug 30, 2024
1 parent 4da163b commit 21d2337
Showing 3 changed files with 7 additions and 7 deletions.
1 change: 1 addition & 0 deletions tests/weight_loading/models.txt
@@ -15,6 +15,7 @@ compressed-tensors, nm-testing/Phi-3-mini-128k-instruct-FP8, main
 compressed-tensors, neuralmagic/Phi-3-medium-128k-instruct-quantized.w4a16, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-quantized, main
 compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W4A16-channel-quantized, main
+compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main
 awq, casperhansen/mixtral-instruct-awq, main
 awq_marlin, casperhansen/mixtral-instruct-awq, main
 fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
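
Each models.txt entry appears to follow a "quantization method, model repository, revision" layout. The short sketch below is hypothetical (it is not vLLM's weight-loading test harness); it only illustrates how the new entry could be split into those three fields:

# Hypothetical sketch only (not the actual test harness).
def parse_model_entry(line: str):
    # Split a "quantization, model, revision" line into its three fields.
    quantization, model, revision = [part.strip() for part in line.split(",")]
    return quantization, model, revision

entry = "compressed-tensors, nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized, main"
print(parse_model_entry(entry))
# ('compressed-tensors', 'nm-testing/Mixtral-8x7B-Instruct-v0.1-W8A16-quantized', 'main')
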
8 changes: 4 additions & 4 deletions vllm/model_executor/layers/fused_moe/__init__.py
@@ -1,5 +1,3 @@
-from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
-    fused_moe_marlin, single_moe_marlin)
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
@@ -8,16 +6,18 @@
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
-    "fused_moe_marlin",
-    "single_moe_marlin",
 ]
 
 if HAS_TRITON:
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         fused_experts, fused_moe, fused_topk, get_config_file_name,
         grouped_topk)
+    from vllm.model_executor.layers.fused_moe.fused_moe_marlin import (
+        fused_moe_marlin, single_moe_marlin)
 
     __all__ += [
+        "fused_moe_marlin",
+        "single_moe_marlin",
         "fused_moe",
         "fused_topk",
         "fused_experts",
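
With this change, the Marlin MoE helpers are imported and re-exported only when Triton is available. The sketch below is illustrative (not code from the vLLM repository) and assumes only the names visible in the diff above:

# Minimal sketch: use the Marlin MoE exports only when Triton is present.
from vllm.triton_utils import HAS_TRITON

if HAS_TRITON:
    # Re-exported by the fused_moe package only in the Triton case.
    from vllm.model_executor.layers.fused_moe import (fused_moe_marlin,
                                                      single_moe_marlin)
else:
    # Fall back: skip Marlin MoE code paths when Triton is unavailable.
    fused_moe_marlin = None
    single_moe_marlin = None
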
5 changes: 2 additions & 3 deletions vllm/model_executor/layers/fused_moe/fused_moe_marlin.py
@@ -5,11 +5,10 @@
 import torch
 
 from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    fused_topk, moe_align_block_size, try_get_optimal_moe_config)
 from vllm.scalar_type import scalar_types
-
-from .fused_moe import (fused_topk, moe_align_block_size,
-                        try_get_optimal_moe_config)
 
 
 def single_moe_marlin(
     hidden_states: torch.Tensor,
