diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 144ade58ea..8b326cb612 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -169,9 +169,11 @@ def __post_init__(self): gpu_mem = get_amdgpu_memory_capacity() else: gpu_mem = get_nvgpu_memory_capacity() + + # If the GPU memory is less than 25GB (like RTX 4090) and the user hasn't manually specified the chunked prefill size, we reduce its default value by a factor of 4. if gpu_mem < 25000: - self.chunked_prefill_size //= 4 # make it 2048 - self.cuda_graph_max_bs = 4 + if self.chunked_prefill_size == 8192: + self.chunked_prefill_size //= 4 # make it 2048 logger.info("Automatically adjust --chunked-prefill-size for small GPUs.") # Choose kernel backends