diff --git a/engines/python/setup/djl_python/properties_manager/properties.py b/engines/python/setup/djl_python/properties_manager/properties.py
index 2b17c84db..85ed64f79 100644
--- a/engines/python/setup/djl_python/properties_manager/properties.py
+++ b/engines/python/setup/djl_python/properties_manager/properties.py
@@ -61,9 +61,9 @@ class Properties(BaseModel):
     input_formatter: Optional[Callable] = None
     waiting_steps: Optional[int] = None
     mpi_mode: bool = False
-    tgi_compat: Optional[bool] = False
-    bedrock_compat: Optional[bool] = False
-    enable_lora: Optional[bool] = False
+    tgi_compat: bool = False
+    bedrock_compat: bool = False
+    enable_lora: bool = False
 
     # Spec_dec
     draft_model_id: Optional[str] = None
diff --git a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
index ab8b64fd4..e972c9fb3 100644
--- a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
@@ -11,11 +11,12 @@
 # BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. See the License for
 # the specific language governing permissions and limitations under the License.
 import ast
-import dataclasses
-from typing import Optional, Any, Mapping, Tuple, Dict
-
-from pydantic import field_validator, model_validator, ConfigDict
+import logging
+from typing import Optional, Any, Dict, Tuple
+from pydantic import field_validator, model_validator, ConfigDict, Field
 from vllm import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+from vllm.engine.arg_utils import StoreBoolean
 
 from djl_python.properties_manager.properties import Properties
 
@@ -29,17 +30,19 @@ class VllmRbProperties(Properties):
     engine: Optional[str] = None
 
-    # The following configs have different names in DJL compared to vLLM
-    quantize: Optional[str] = None
+    # The following configs have different names in DJL compared to vLLM, we only accept DJL name currently
     tensor_parallel_degree: int = 1
     pipeline_parallel_degree: int = 1
-    max_rolling_batch_prefill_tokens: Optional[int] = None
-    cpu_offload_gb_per_gpu: Optional[int] = 0
+    # The following configs have different names in DJL compared to vLLM, either is accepted
+    quantize: Optional[str] = Field(alias="quantization", default=None)
+    max_rolling_batch_prefill_tokens: Optional[int] = Field(
+        alias="max_num_batched_tokens", default=None)
+    cpu_offload_gb_per_gpu: Optional[float] = Field(alias="cpu_offload_gb",
+                                                    default=None)
 
     # The following configs have different defaults, or additional processing in DJL compared to vLLM
     dtype: str = "auto"
-    max_loras: Optional[int] = 4
+    max_loras: int = 4
     long_lora_scaling_factors: Optional[Tuple[float, ...]] = None
-    limit_mm_per_prompt: Optional[Mapping[str, int]] = None
 
     # Neuron vLLM properties
     device: Optional[str] = None
@@ -56,7 +59,17 @@ def validate_engine(cls, engine):
                 f"Need python engine to start vLLM RollingBatcher")
         return engine
 
+    @model_validator(mode='after')
+    def validate_pipeline_parallel(self):
+        if self.pipeline_parallel_degree != 1:
+            raise ValueError(
+                "Pipeline parallelism is not supported in vLLM's LLMEngine used in rolling_batch implementation"
+            )
+        return self
+
     @field_validator('long_lora_scaling_factors', mode='before')
+    # TODO: processing of this field is broken in vllm via from_cli_args
+    # we should upstream a fix for this to vllm
     def validate_long_lora_scaling_factors(cls, val):
         if isinstance(val, str):
             val = ast.literal_eval(val)
@@ -73,112 +86,91 @@ def validate_long_lora_scaling_factors(cls, val):
             )
         return val
 
-    @field_validator('limit_mm_per_prompt', mode="before")
-    def validate_limit_mm_per_prompt(cls, val) -> Mapping[str, int]:
-        out_dict: Dict[str, int] = {}
-        for item in val.split(","):
-            kv_parts = [part.lower().strip() for part in item.split("=")]
-            if len(kv_parts) != 2:
-                raise ValueError("Each item should be in the form key=value")
-            key, value = kv_parts
-
-            try:
-                parsed_value = int(value)
-            except ValueError as e:
-                raise ValueError(
-                    f"Failed to parse value of item {key}={value}") from e
-
-            if key in out_dict and out_dict[key] != parsed_value:
-                raise ValueError(
-                    f"Conflicting values specified for key: {key}")
-            out_dict[key] = parsed_value
-        return out_dict
-
-    @model_validator(mode='after')
-    def validate_pipeline_parallel(self):
-        if self.pipeline_parallel_degree != 1:
-            raise ValueError(
-                "Pipeline parallelism is not supported in vLLM's LLMEngine used in rolling_batch implementation"
-            )
-        return self
-
     def handle_lmi_vllm_config_conflicts(self, additional_vllm_engine_args):
 
-        def djl_config_conflicts_with_vllm_config(lmi_config_name,
-                                                  vllm_config_name) -> bool:
-            # TODO: We may be able to refactor this to throw the ValueError directly from this method.
-            # The errors are slightly different depending on the specific configs, so for now we keep
-            # the exception separate in favor of better, more specific client errors
+        def validate_potential_lmi_vllm_config_conflict(
+                lmi_config_name, vllm_config_name):
             lmi_config_val = self.__getattribute__(lmi_config_name)
             vllm_config_val = additional_vllm_engine_args.get(vllm_config_name)
             if vllm_config_val is not None and lmi_config_val is not None:
-                return lmi_config_val != vllm_config_val
-            return False
-
-        if djl_config_conflicts_with_vllm_config("quantize", "quantization"):
-            raise ValueError(
-                "Both the DJL quantize config, and vllm quantization configs have been set with conflicting values."
-                "Only set the DJL quantize config")
-        if djl_config_conflicts_with_vllm_config("tensor_parallel_degree",
-                                                 "tensor_parallel_size"):
-            raise ValueError(
-                "Both the DJL tensor_parallel_degree and vllm tensor_parallel_size configs have been set with conflicting values."
-                "Only set the DJL tensor_parallel_degree config")
-        if djl_config_conflicts_with_vllm_config("pipeline_parallel_degree",
-                                                 "pipeline_parallel_size"):
-            raise ValueError(
-                "Both the DJL pipeline_parallel_degree and vllm pipeline_parallel_size configs have been set with conflicting values."
-                "Only set the DJL pipeline_parallel_degree config")
-        if djl_config_conflicts_with_vllm_config(
-                "max_rolling_batch_prefill_tokens", "max_num_batched_tokens"):
-            raise ValueError(
-                "Both the DJL max_rolling_batch_prefill_tokens and vllm max_num_batched_tokens configs have been set with conflicting values."
-                "Only set one of these configurations")
-        if djl_config_conflicts_with_vllm_config("cpu_offload_gb_per_gpu",
-                                                 "cpu_offload_gb"):
-            raise ValueError(
-                "Both the DJL cpu_offload_gb_per_gpu and vllm cpu_offload_gb configs have been set with conflicting values."
-                "Only set one of these configurations")
+                if vllm_config_val != lmi_config_val:
+                    raise ValueError(
+                        f"Both the DJL {lmi_config_name}={lmi_config_val} and vLLM {vllm_config_name}={vllm_config_val} configs have been set with conflicting values. "
+                        f"We currently only accept the DJL config {lmi_config_name}, please remove the vllm {vllm_config_name} configuration."
+                    )
+
+        validate_potential_lmi_vllm_config_conflict("tensor_parallel_degree",
+                                                    "tensor_parallel_size")
+        validate_potential_lmi_vllm_config_conflict("pipeline_parallel_degree",
+                                                    "pipeline_parallel_size")
+        validate_potential_lmi_vllm_config_conflict("max_rolling_batch_size",
+                                                    "max_num_seqs")
+
+    def generate_vllm_engine_arg_dict(self,
+                                      passthrough_vllm_engine_args) -> dict:
+        vllm_engine_args = {
+            'model': self.model_id_or_path,
+            'tensor_parallel_size': self.tensor_parallel_degree,
+            'pipeline_parallel_size': self.pipeline_parallel_degree,
+            'max_num_seqs': self.max_rolling_batch_size,
+            'dtype': DTYPE_MAPPER[self.dtype],
+            'revision': self.revision,
+            'max_loras': self.max_loras,
+            'enable_lora': self.enable_lora,
+        }
+        if self.quantize is not None:
+            vllm_engine_args['quantization'] = self.quantize
+        if self.max_rolling_batch_prefill_tokens is not None:
+            vllm_engine_args[
+                'max_num_batched_tokens'] = self.max_rolling_batch_prefill_tokens
+        if self.cpu_offload_gb_per_gpu is not None:
+            vllm_engine_args['cpu_offload_gb'] = self.cpu_offload_gb_per_gpu
+        if self.device is not None:
+            vllm_engine_args['device'] = self.device
+        if self.preloaded_model is not None:
+            vllm_engine_args['preloaded_model'] = self.preloaded_model
+        if self.generation_config is not None:
+            vllm_engine_args['generation_config'] = self.generation_config
+        vllm_engine_args.update(passthrough_vllm_engine_args)
+        return vllm_engine_args
 
     def get_engine_args(self) -> EngineArgs:
         additional_vllm_engine_args = self.get_additional_vllm_engine_args()
         self.handle_lmi_vllm_config_conflicts(additional_vllm_engine_args)
-        max_model_len = additional_vllm_engine_args.pop("max_model_len", None)
-        if self.device == 'neuron':
-            return EngineArgs(
-                model=self.model_id_or_path,
-                preloaded_model=self.preloaded_model,
-                tensor_parallel_size=self.tensor_parallel_degree,
-                pipeline_parallel_size=self.pipeline_parallel_degree,
-                dtype=DTYPE_MAPPER[self.dtype],
-                max_num_seqs=self.max_rolling_batch_size,
-                block_size=max_model_len,
-                max_model_len=max_model_len,
-                trust_remote_code=self.trust_remote_code,
-                revision=self.revision,
-                device=self.device,
-                generation_config=self.generation_config,
-                **additional_vllm_engine_args,
-            )
-        return EngineArgs(
-            model=self.model_id_or_path,
-            tensor_parallel_size=self.tensor_parallel_degree,
-            pipeline_parallel_size=self.pipeline_parallel_degree,
-            dtype=DTYPE_MAPPER[self.dtype],
-            max_model_len=max_model_len,
-            quantization=self.quantize,
-            max_num_batched_tokens=self.max_rolling_batch_prefill_tokens,
-            max_loras=self.max_loras,
-            long_lora_scaling_factors=self.long_lora_scaling_factors,
-            cpu_offload_gb=self.cpu_offload_gb_per_gpu,
-            limit_mm_per_prompt=self.limit_mm_per_prompt,
-            **additional_vllm_engine_args,
+        vllm_engine_arg_dict = self.generate_vllm_engine_arg_dict(
+            additional_vllm_engine_args)
+        logging.debug(
+            f"Constructing vLLM engine args from the following DJL configs: {vllm_engine_arg_dict}"
         )
+        parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+        args_list = self.construct_vllm_args_list(vllm_engine_arg_dict, parser)
+        args = parser.parse_args(args=args_list)
+        engine_args = EngineArgs.from_cli_args(args)
+        # we have to do this separately because vllm converts it into a string
+        engine_args.long_lora_scaling_factors = self.long_lora_scaling_factors
+        return engine_args
 
     def get_additional_vllm_engine_args(self) -> Dict[str, Any]:
-        all_engine_args = EngineArgs.__annotations__
         return {
-            arg: val
-            for arg, val in self.__pydantic_extra__.items()
-            if arg in all_engine_args
+            k: v
+            for k, v in self.__pydantic_extra__.items()
+            if k in EngineArgs.__annotations__
         }
+
+    def construct_vllm_args_list(self, vllm_engine_args: dict,
+                                 parser: FlexibleArgumentParser):
+        # Modified from https://github.com/vllm-project/vllm/blob/v0.6.4/vllm/utils.py#L1258
+        args_list = []
+        store_boolean_arguments = {
+            action.dest
+            for action in parser._actions if isinstance(action, StoreBoolean)
+        }
+        for engine_arg, engine_arg_value in vllm_engine_args.items():
+            if str(engine_arg_value).lower() in {
+                    'true', 'false'
+            } and engine_arg not in store_boolean_arguments:
+                if str(engine_arg_value).lower() == 'true':
+                    args_list.append(f"--{engine_arg}")
+            else:
+                args_list.append(f"--{engine_arg}={engine_arg_value}")
+        return args_list
diff --git a/engines/python/setup/djl_python/rolling_batch/vllm_rolling_batch.py b/engines/python/setup/djl_python/rolling_batch/vllm_rolling_batch.py
index 25ee2da4b..d49b753bd 100644
--- a/engines/python/setup/djl_python/rolling_batch/vllm_rolling_batch.py
+++ b/engines/python/setup/djl_python/rolling_batch/vllm_rolling_batch.py
@@ -52,7 +52,7 @@ def __init__(self, model_id_or_path: str, properties: dict,
         self.request_cache = OrderedDict()
         self.lora_id_counter = AtomicCounter(0)
         self.lora_requests = {}
-        self.is_mistral_tokenizer = self.vllm_configs.tokenizer_mode == 'mistral'
+        self.is_mistral_tokenizer = args.tokenizer_mode == 'mistral'
 
     def get_tokenizer(self):
         return self.engine.tokenizer.tokenizer
diff --git a/engines/python/setup/djl_python/tests/test_properties_manager.py b/engines/python/setup/djl_python/tests/test_properties_manager.py
index 262b0c676..5647bf88f 100644
--- a/engines/python/setup/djl_python/tests/test_properties_manager.py
+++ b/engines/python/setup/djl_python/tests/test_properties_manager.py
@@ -3,6 +3,8 @@
 import unittest
 from unittest import mock
 
+from vllm import EngineArgs
+
 from djl_python.properties_manager.properties import Properties
 from djl_python.properties_manager.tnx_properties import (
     TransformerNeuronXProperties, TnXGenerationStrategy, TnXModelSchema,
@@ -422,70 +424,92 @@ def test_hf_error_case(self, params):
 
     def test_vllm_properties(self):
         # test with valid vllm properties
-
-        def test_vllm_valid(properties):
-            vllm_configs = VllmRbProperties(**properties)
+        def validate_vllm_config_and_engine_args_match(
+            vllm_config_value,
+            engine_arg_value,
+            expected_value,
+        ):
+            self.assertEqual(vllm_config_value, expected_value)
+            self.assertEqual(engine_arg_value, expected_value)
+
+        def test_vllm_default_properties():
+            required_properties = {
+                "engine": "Python",
+                "model_id_or_path": "some_model",
+            }
+            vllm_configs = VllmRbProperties(**required_properties)
             engine_args = vllm_configs.get_engine_args()
-            self.assertEqual(vllm_configs.model_id_or_path, engine_args.model)
-            self.assertEqual(vllm_configs.max_rolling_batch_prefill_tokens,
-                             engine_args.max_num_batched_tokens)
-            self.assertEqual(vllm_configs.tensor_parallel_degree,
-                             engine_args.tensor_parallel_size)
-            self.assertEqual(vllm_configs.pipeline_parallel_degree,
-                             engine_args.pipeline_parallel_size)
-            self.assertEqual(vllm_configs.quantize, engine_args.quantization)
-            self.assertEqual(DTYPE_MAPPER[vllm_configs.dtype],
-                             engine_args.dtype)
-            self.assertEqual(vllm_configs.cpu_offload_gb_per_gpu,
-                             engine_args.cpu_offload_gb)
-
-        def test_long_lora_scaling_factors(properties):
-            properties['long_lora_scaling_factors'] = "3.0"
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.model_id_or_path, engine_args.model, "some_model")
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.tensor_parallel_degree,
+                engine_args.tensor_parallel_size, 1)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.pipeline_parallel_degree,
+                engine_args.pipeline_parallel_size, 1)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.quantize, engine_args.quantization, None)
+            validate_vllm_config_and_engine_args_match(
+                vllm_configs.max_rolling_batch_size, engine_args.max_num_seqs,
+                HuggingFaceProperties.max_rolling_batch_size)
+            validate_vllm_config_and_engine_args_match(vllm_configs.dtype,
+                                                       engine_args.dtype,
+                                                       'auto')
+            validate_vllm_config_and_engine_args_match(vllm_configs.max_loras,
+                                                       engine_args.max_loras,
+                                                       4)
+            self.assertEqual(vllm_configs.cpu_offload_gb_per_gpu, None)
+            self.assertEqual(
+                len(vllm_configs.get_additional_vllm_engine_args()), 0)
+
+        def test_long_lora_scaling_factors():
+            properties = {
+                "engine": "Python",
+                "model_id_or_path": "some_model",
+                'long_lora_scaling_factors': "3.0"
+            }
             vllm_props = VllmRbProperties(**properties)
-            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))
+            engine_args = vllm_props.get_engine_args()
+            self.assertEqual(engine_args.long_lora_scaling_factors, (3.0, ))
 
             properties['long_lora_scaling_factors'] = "3"
             vllm_props = VllmRbProperties(**properties)
-            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))
+            engine_args = vllm_props.get_engine_args()
+            self.assertEqual(engine_args.long_lora_scaling_factors, (3.0, ))
 
             properties['long_lora_scaling_factors'] = "3.0,4.0"
             vllm_props = VllmRbProperties(**properties)
-            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))
+            engine_args = vllm_props.get_engine_args()
+            self.assertEqual(engine_args.long_lora_scaling_factors, (3.0, 4.0))
 
             properties['long_lora_scaling_factors'] = "3.0, 4.0 "
             vllm_props = VllmRbProperties(**properties)
-            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))
+            engine_args = vllm_props.get_engine_args()
+            self.assertEqual(engine_args.long_lora_scaling_factors, (3.0, 4.0))
 
             properties['long_lora_scaling_factors'] = "(3.0,)"
             vllm_props = VllmRbProperties(**properties)
-            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, ))
+            engine_args = vllm_props.get_engine_args()
+            self.assertEqual(engine_args.long_lora_scaling_factors, (3.0, ))
 
             properties['long_lora_scaling_factors'] = "(3.0,4.0)"
             vllm_props = VllmRbProperties(**properties)
-            self.assertEqual(vllm_props.long_lora_scaling_factors, (3.0, 4.0))
+            engine_args = vllm_props.get_engine_args()
+            self.assertEqual(engine_args.long_lora_scaling_factors, (3.0, 4.0))
 
-        def test_invalid_long_lora_scaling_factors(properties):
-            properties['long_lora_scaling_factors'] = "a,b"
+        def test_invalid_long_lora_scaling_factors():
+            properties = {
+                "engine": "Python",
+                "model_id_or_path": "some_model",
+                'long_lora_scaling_factors': "a,b"
+            }
+            vllm_props = VllmRbProperties(**properties)
             with self.assertRaises(ValueError):
-                VllmRbProperties(**properties)
+                vllm_props.get_engine_args()
 
-        properties = {
-            'model_id': 'sample_model_id',
-            'engine': 'Python',
-            'max_rolling_batch_prefill_tokens': '12500',
-            'max_model_len': '12800',
-            'tensor_parallel_degree': '2',
-            'dtype': 'fp16',
-            'quantize': 'awq',
-            'enforce_eager': "True",
-            'enable_lora': "true",
-            "gpu_memory_utilization": "0.85",
-            'load_format': 'pt',
-            'cpu_offload_gb_per_gpu': '3',
-        }
-        test_vllm_valid(properties.copy())
-        test_long_lora_scaling_factors(properties.copy())
-        test_invalid_long_lora_scaling_factors(properties.copy())
+        test_vllm_default_properties()
+        test_long_lora_scaling_factors()
+        test_invalid_long_lora_scaling_factors()
 
     def test_sd_inf2_properties(self):
         properties = {
diff --git a/engines/python/setup/djl_python_engine.py b/engines/python/setup/djl_python_engine.py
index f11c9fa2d..e06270f82 100644
--- a/engines/python/setup/djl_python_engine.py
+++ b/engines/python/setup/djl_python_engine.py
@@ -189,7 +189,7 @@ def main():
 
     # noinspection PyBroadException
     try:
-        args = ArgParser.python_engine_args().parse_args()
+        args = ArgParser.python_engine_args().parse_args(args=sys.argv[1:])
         logging.basicConfig(stream=sys.stdout,
                             format="%(levelname)s::%(message)s",
                             level=args.log_level.upper())