[Track] progress in removing vLLM dependencies #2245

Open · 2 tasks
zhyncs opened this issue Nov 28, 2024 · 3 comments

zhyncs (Member) commented Nov 28, 2024

Checklist

Motivation

ref #1673

Create a separate issue to follow up on the two currently important PRs:

weight loader: #2220 @HandH1998
distributed: #2244 @yizhang2077

Related resources

No response

zhyncs (Member, Author) commented Dec 2, 2024

ref #2318

zhyncs (Member, Author) commented Dec 2, 2024

TL;DR
Currently, aside from quantization, the main pieces left are distributed and RoPE.
cc @HandH1998 @yizhang2077 @james-p-xu @ispobock @merrymercy @Ying1123

Remaining vLLM imports on current main:

distributed/parallel_state.py
1261:    import vllm.distributed.parallel_state as vllm_parrlel_state

models/mllama.py
11:import vllm.distributed.parallel_state as ps

utils.py
497:    import vllm.distributed.device_communicators.custom_all_reduce_utils as tgt

layers/logits_processor.py
21:from vllm.distributed import (

layers/custom_op_util.py
15:from vllm.model_executor.custom_op import CustomOp

layers/layernorm.py
32:from vllm.model_executor.custom_op import CustomOp
128:    from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm

layers/linear.py
10:from vllm.distributed import (
20:from vllm.model_executor.layers.linear import LinearBase
21:from vllm.model_executor.parameter import (

layers/vocab_parallel_embedding.py
9:from vllm.distributed import (
15:from vllm.model_executor.parameter import BasevLLMParameter

layers/fused_moe_triton/fused_moe.py
14:from vllm import _custom_ops as ops

layers/fused_moe_triton/layer.py
8:from vllm.distributed import (
13:from vllm.model_executor.custom_op import CustomOp

layers/activation.py
28:from vllm.distributed import (
33:from vllm.model_executor.custom_op import CustomOp
156:    from vllm.model_executor.layers.activation import GeluAndMul, SiluAndMul

layers/quantization/__init__.py
6:from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
7:from vllm.model_executor.layers.quantization.awq import AWQConfig
8:from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
9:from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
10:from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
13:from vllm.model_executor.layers.quantization.deepspeedfp import DeepSpeedFPConfig
14:from vllm.model_executor.layers.quantization.experts_int8 import ExpertsInt8Config
15:from vllm.model_executor.layers.quantization.fbgemm_fp8 import FBGEMMFp8Config
16:from vllm.model_executor.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod
17:from vllm.model_executor.layers.quantization.gguf import GGUFConfig
18:from vllm.model_executor.layers.quantization.gptq import GPTQConfig
19:from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinConfig
20:from vllm.model_executor.layers.quantization.gptq_marlin_24 import GPTQMarlin24Config
21:from vllm.model_executor.layers.quantization.marlin import MarlinConfig
22:from vllm.model_executor.layers.quantization.qqq import QQQConfig
23:from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig
102:    from vllm.model_executor.layers.linear import LinearBase
103:    from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
104:    from vllm.model_executor.layers.quantization.utils.quant_utils import (

model_executor/cuda_graph_runner.py
23:from vllm.distributed.parallel_state import graph_capture
24:from vllm.model_executor.custom_op import CustomOp

model_executor/model_runner.py
23:from vllm.distributed import (
29:from vllm.distributed.parallel_state import in_the_same_node_as

models/qwen2_vl.py
33:from vllm.distributed import parallel_state
34:from vllm.distributed import utils as dist_utils
35:from vllm.logger import init_logger
36:from vllm.model_executor.layers.activation import QuickGELU

models/dbrx.py
22:from vllm.distributed import (
27:from vllm.model_executor.layers.rotary_embedding import get_rope
28:from vllm.transformers_utils.configs.dbrx import DbrxConfig

models/gpt2.py
25:from vllm.distributed.parallel_state import get_tensor_model_parallel_world_size
26:from vllm.model_executor.layers.activation import get_act_fn
27:from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding

models/olmo2.py
24:from vllm.distributed import (
30:from vllm.model_executor.layers.rotary_embedding import get_rope
31:from vllm.model_executor.model_loader.weight_utils import default_weight_loader

models/chatglm.py
24:from vllm.distributed import get_tensor_model_parallel_world_size
25:from vllm.model_executor.layers.rotary_embedding import get_rope
26:from vllm.transformers_utils.configs import ChatGLMConfig

models/grok.py
25:from vllm.distributed import get_tensor_model_parallel_world_size
26:from vllm.model_executor.layers.rotary_embedding import get_rope

models/internlm2.py
22:from vllm.distributed import get_tensor_model_parallel_world_size
23:from vllm.model_executor.layers.rotary_embedding import get_rope

models/llama.py
25:from vllm.distributed import get_tensor_model_parallel_world_size
26:from vllm.model_executor.layers.rotary_embedding import get_rope

models/olmoe.py
26:from vllm.distributed import (
30:from vllm.model_executor.layers.linear import (
36:from vllm.model_executor.layers.rotary_embedding import get_rope

models/xverse.py
24:from vllm.distributed import get_tensor_model_parallel_world_size
25:from vllm.model_executor.layers.activation import SiluAndMul
26:from vllm.model_executor.layers.layernorm import RMSNorm
27:from vllm.model_executor.layers.linear import (
32:from vllm.model_executor.layers.rotary_embedding import get_rope

models/minicpm.py
21:from vllm.distributed import get_tensor_model_parallel_world_size
22:from vllm.model_executor.layers.rotary_embedding import get_rope

models/deepseek.py
24:from vllm.distributed import (
29:from vllm.model_executor.layers.rotary_embedding import get_rope

models/mllama.py
17:from vllm.distributed import get_tensor_model_parallel_world_size

models/stablelm.py
27:from vllm.distributed import get_tensor_model_parallel_world_size
28:from vllm.model_executor.layers.rotary_embedding import get_rope

models/gemma2.py
23:from vllm.distributed import get_tensor_model_parallel_world_size
48:from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
146:        # from vLLM: TODO(woosuk): Use the `get_rope` interface.

models/qwen.py
23:from vllm.distributed import get_tensor_model_parallel_world_size
24:from vllm.model_executor.layers.rotary_embedding import get_rope

models/mixtral_quant.py
26:from vllm.distributed import (
31:from vllm.model_executor.layers.rotary_embedding import get_rope

models/minicpm3.py
22:from vllm.distributed import get_tensor_model_parallel_world_size
23:from vllm.model_executor.layers.linear import (
29:from vllm.model_executor.layers.rotary_embedding import get_rope

models/gpt_bigcode.py
24:from vllm.distributed import get_tensor_model_parallel_world_size

models/commandr.py
47:from vllm.distributed import (
51:from vllm.model_executor.layers.rotary_embedding import get_rope

models/torch_native_llama.py
50:from vllm.distributed import (
54:from vllm.model_executor.layers.rotary_embedding import get_rope

models/phi3_small.py
8:from vllm.distributed import get_tensor_model_parallel_world_size
9:from vllm.model_executor.layers.rotary_embedding import get_rope

models/xverse_moe.py
21:from vllm.distributed import (
26:from vllm.model_executor.layers.activation import SiluAndMul
27:from vllm.model_executor.layers.layernorm import RMSNorm
28:from vllm.model_executor.layers.linear import (
34:from vllm.model_executor.layers.rotary_embedding import get_rope

models/gemma.py
24:from vllm.distributed import get_tensor_model_parallel_world_size
25:from vllm.model_executor.layers.rotary_embedding import get_rope

models/baichuan.py
27:from vllm.distributed import (
31:from vllm.model_executor.layers.linear import (
36:from vllm.model_executor.layers.rotary_embedding import get_rope

models/exaone.py
23:from vllm.distributed import get_tensor_model_parallel_world_size
24:from vllm.model_executor.layers.rotary_embedding import get_rope

models/qwen2.py
23:from vllm.distributed import get_tensor_model_parallel_world_size
24:from vllm.model_executor.layers.rotary_embedding import get_rope

models/deepseek_v2.py
24:from vllm.distributed import (
30:from vllm.model_executor.layers.rotary_embedding import get_rope

models/olmo.py
23:from vllm.distributed import get_tensor_model_parallel_world_size
24:from vllm.model_executor.layers.rotary_embedding import get_rope

models/mixtral.py
24:from vllm.distributed import get_tensor_model_parallel_world_size
25:from vllm.model_executor.layers.rotary_embedding import get_rope

models/qwen2_moe.py
25:from vllm.distributed import (
29:from vllm.model_executor.layers.rotary_embedding import get_rope

utils.py
430:    from vllm.logger import logger as vllm_default_logger
502:    from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
513:    from vllm.distributed.parallel_state import GroupCoordinator
554:    from vllm.model_executor.layers.linear import LinearBase
555:    from vllm.model_executor.layers.quantization.gguf import (
792:    from vllm.distributed import get_tensor_model_parallel_rank

model_loader/weight_utils.py
22:from vllm.distributed import get_tensor_model_parallel_rank

lora/lora.py
30:from vllm.model_executor.layers.vocab_parallel_embedding import (

hf_transformers_utils.py
34:    from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig

model_loader/loader.py
24:from vllm.distributed import (
499:        from vllm.distributed import get_tensor_model_parallel_rank
559:        from vllm.distributed import get_tensor_model_parallel_rank
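
Most of the vllm.distributed imports above boil down to tensor-parallel group bookkeeping (get_tensor_model_parallel_world_size / get_tensor_model_parallel_rank). A minimal sketch of what an in-tree replacement could look like, built directly on torch.distributed; the module layout and names here are illustrative assumptions, not the actual SGLang implementation:

```python
from typing import Optional

import torch.distributed as dist

# Hypothetical module-level tensor-parallel group, set once during init.
_TP_GROUP: Optional[dist.ProcessGroup] = None


def initialize_tensor_model_parallel(tp_size: int) -> None:
    """Split the world into tensor-parallel groups of consecutive ranks."""
    global _TP_GROUP
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    for start in range(0, world_size, tp_size):
        ranks = list(range(start, start + tp_size))
        # new_group must be called by every rank for every group.
        group = dist.new_group(ranks)
        if rank in ranks:
            _TP_GROUP = group


def get_tensor_model_parallel_world_size() -> int:
    return dist.get_world_size(group=_TP_GROUP)


def get_tensor_model_parallel_rank() -> int:
    return dist.get_rank(group=_TP_GROUP)
```

The trickier pieces in the listing (custom all-reduce, graph_capture, in_the_same_node_as) would still need dedicated ports.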

james-p-xu (Contributor) commented Dec 2, 2024

Hi @zhyncs, the RoPE PR (#2016) is still blocked by a flashinfer version upgrade. Happy to discuss this offline whenever you're available. In addition, #2016 also uses vllm.model_executor.custom_op, which may need to be removed in a separate change.
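
For context, vllm.model_executor.custom_op is essentially a dispatch wrapper that routes forward() to a backend-specific implementation. A minimal sketch of an in-tree stand-in (illustrative only, not the actual vLLM class):

```python
import torch
from torch import nn


class CustomOp(nn.Module):
    """Dispatch forward() to a backend-specific implementation.

    Minimal stand-in for illustration; subclasses override forward_native
    (pure PyTorch) and optionally forward_cuda (fused kernel path).
    """

    def forward(self, *args, **kwargs):
        if torch.cuda.is_available():
            return self.forward_cuda(*args, **kwargs)
        return self.forward_native(*args, **kwargs)

    def forward_native(self, *args, **kwargs):
        raise NotImplementedError

    def forward_cuda(self, *args, **kwargs):
        # Default: fall back to the pure-PyTorch implementation.
        return self.forward_native(*args, **kwargs)


class SiluAndMul(CustomOp):
    """Example subclass: SiLU(gate) * up over a concatenated last dimension."""

    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        return torch.nn.functional.silu(x[..., :d]) * x[..., d:]
```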
