Commit

Update vllm arguments

xusenlin committed Aug 22, 2023
1 parent 9c6fc00 commit abaf4f2
Showing 3 changed files with 39 additions and 11 deletions.
36 changes: 25 additions & 11 deletions api/config.py
@@ -1,5 +1,5 @@
import os

from loguru import logger
import dotenv

dotenv.load_dotenv()
@@ -11,29 +11,39 @@
'MODEL_NAME': '',
'MODEL_PATH': '',
'ADAPTER_MODEL_PATH': '',

'DEVICE': 'cuda',
'DEVICE_MAP': "",
'GPUS': '',
'NUM_GPUs': 1,
'QUANTIZE': 16,

'EMBEDDING_NAME': '',
'CONTEXT_LEN': '',
'EMBEDDING_SIZE': '',
'EMBEDDING_DEVICE': 'cuda',

'QUANTIZE': 16,
'LOAD_IN_8BIT': 'False',
'LOAD_IN_4BIT': 'False',
'USING_PTUNING_V2': 'False',

'CONTEXT_LEN': '',
'STREAM_INTERVERL': 2,
'PROMPT_NAME': '',

'PATCH_TYPE': '',
'TRAINING_LENGTH': 4096,
'WINDOW_SIZE': 512,

'API_PREFIX': '/v1',

'USE_VLLM': 'False',
'TRUST_REMOTE_CODE': "False",
'TOKENIZE_MODE': "auto",
'TENSOR_PARALLEL_SIZE': 1,
'DTYPE': "half",
'EMBEDDING_SIZE': '',
'EMBEDDING_DEVICE': 'cuda',
"GPU_MEMORY_UTILIZATION": 0.9,
"MAX_NUM_BATCHED_TOKENS": 5120,
"MAX_NUM_SEQS": 256,
}


@@ -61,15 +71,19 @@ def __init__(self):
self.GPUS = get_env('GPUS')
self.NUM_GPUs = int(get_env('NUM_GPUs'))

self.QUANTIZE = int(get_env('QUANTIZE'))
self.EMBEDDING_NAME = get_env('EMBEDDING_NAME') if get_env('EMBEDDING_NAME') else None
self.CONTEXT_LEN = int(get_env('CONTEXT_LEN')) if get_env('CONTEXT_LEN') else None
self.EMBEDDING_SIZE = int(get_env('EMBEDDING_SIZE')) if get_env('EMBEDDING_SIZE') else None
self.EMBEDDING_DEVICE = get_env('EMBEDDING_DEVICE')

self.QUANTIZE = int(get_env('QUANTIZE'))
self.LOAD_IN_8BIT = get_bool_env('LOAD_IN_8BIT')
self.LOAD_IN_4BIT = get_bool_env('LOAD_IN_4BIT')
self.USING_PTUNING_V2 = get_bool_env('USING_PTUNING_V2')

self.CONTEXT_LEN = int(get_env('CONTEXT_LEN')) if get_env('CONTEXT_LEN') else None
self.STREAM_INTERVERL = int(get_env('STREAM_INTERVERL'))
self.PROMPT_NAME = get_env('PROMPT_NAME') if get_env('PROMPT_NAME') else None

self.PATCH_TYPE = get_env('PATCH_TYPE') if get_env('PATCH_TYPE') else None
self.TRAINING_LENGTH = int(get_env('TRAINING_LENGTH'))
self.WINDOW_SIZE = int(get_env('WINDOW_SIZE'))
Expand All @@ -81,13 +95,13 @@ def __init__(self):
self.TOKENIZE_MODE = get_env('TOKENIZE_MODE')
self.TENSOR_PARALLEL_SIZE = int(get_env('TENSOR_PARALLEL_SIZE'))
self.DTYPE = get_env('DTYPE')

self.EMBEDDING_SIZE = int(get_env('EMBEDDING_SIZE')) if get_env('EMBEDDING_SIZE') else None
self.EMBEDDING_DEVICE = get_env('EMBEDDING_DEVICE')
self.GPU_MEMORY_UTILIZATION = float(get_env('GPU_MEMORY_UTILIZATION'))
self.MAX_NUM_BATCHED_TOKENS = int(get_env('MAX_NUM_BATCHED_TOKENS'))
self.MAX_NUM_SEQS = int(get_env('MAX_NUM_SEQS'))


config = Config()
print(f"Config: {config.__dict__}")
logger.debug(f"Config: {config.__dict__}")
if config.GPUS:
if len(config.GPUS.split(",")) < config.NUM_GPUs:
raise ValueError(
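For context, `__init__` above calls `get_env` and `get_bool_env`, which are defined elsewhere in `api/config.py` and not shown in this diff. A minimal sketch of how such helpers typically behave, assuming they fall back to the defaults dict at the top of the file (`DEFAULTS` here is a placeholder name, since the dict's name is not visible in the hunk):

```python
# Sketch only; the real helpers are outside this diff and may differ.
import os

def get_env(key):
    """Read an environment variable, falling back to the defaults dict."""
    return os.environ.get(key, DEFAULTS.get(key))

def get_bool_env(key):
    """Treat 'true' (case-insensitive) as True; everything else as False."""
    return str(get_env(key)).lower() == 'true'
```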
4 changes: 4 additions & 0 deletions api/models.py
@@ -69,6 +69,9 @@ def get_vllm_engine():
trust_remote_code=config.TRUST_REMOTE_CODE,
dtype=config.DTYPE,
tensor_parallel_size=config.TENSOR_PARALLEL_SIZE,
gpu_memory_utilization=config.GPU_MEMORY_UTILIZATION,
max_num_batched_tokens=config.MAX_NUM_BATCHED_TOKENS,
max_num_seqs=config.MAX_NUM_SEQS,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)

@@ -91,6 +94,7 @@ def get_vllm_engine():
)

engine_model_config = asyncio.run(engine.get_model_config())
engine.engine.scheduler_config.max_model_len = get_context_len(engine_model_config)
engine.max_model_len = get_context_len(engine_model_config)

return engine
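Taken together, the two hunks pass three new throughput/memory knobs through to vLLM. A standalone sketch of what the resulting engine construction amounts to (the model path is hypothetical; the values mirror the defaults added in `api/config.py`):

```python
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

engine_args = AsyncEngineArgs(
    model="/path/to/model",          # hypothetical local path or HF repo id
    trust_remote_code=True,
    dtype="half",
    tensor_parallel_size=1,
    gpu_memory_utilization=0.9,      # fraction of GPU memory vLLM may claim
    max_num_batched_tokens=5120,     # per-iteration token budget for batching
    max_num_seqs=256,                # cap on concurrently scheduled sequences
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
```

Raising `gpu_memory_utilization` gives vLLM more room for its KV cache, while `max_num_batched_tokens` and `max_num_seqs` bound how much work the scheduler packs into each step.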
10 changes: 10 additions & 0 deletions docs/VLLM_SCRIPT.md
@@ -23,6 +23,7 @@ pip uninstall transformer-engine -y

### Environment Variable Descriptions


+ `MODEL_NAME`: model name, e.g. `qwen` or `baichuan-13b-chat`


@@ -44,6 +45,15 @@ pip uninstall transformer-engine -y
+ `EMBEDDING_NAME` (optional): path to the embedding model files; `moka-ai/m3e-base` or `BAAI/bge-large-zh` is recommended


+ `GPU_MEMORY_UTILIZATION` (optional): GPU memory utilization, i.e. the fraction of GPU memory vLLM may use


+ `MAX_NUM_BATCHED_TOKENS` (optional): maximum number of tokens processed per batch


+ `MAX_NUM_SEQS` (optional): batch size, i.e. the maximum number of sequences per iteration; see the sample `.env` fragment below


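Putting the new options together, a `.env` fragment that enables vLLM might look like this (all values are illustrative and the model path is hypothetical):

```
MODEL_NAME=qwen
MODEL_PATH=/path/to/model
USE_VLLM=True
TRUST_REMOTE_CODE=True
DTYPE=half
TENSOR_PARALLEL_SIZE=1
GPU_MEMORY_UTILIZATION=0.9
MAX_NUM_BATCHED_TOKENS=5120
MAX_NUM_SEQS=256
```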
### How to Launch

Choose one of the two methods below to start the model API service.
