From 325a06c2deef25067f8b37e73358b4569e13def7 Mon Sep 17 00:00:00 2001
From: Ying Sheng
Date: Sun, 28 Jul 2024 23:01:45 -0700
Subject: [PATCH] Fix logging (#796)

---
 docs/en/hyperparameter_tuning.md                      |  2 +-
 python/sglang/lang/backend/openai.py                  |  2 +-
 python/sglang/srt/managers/controller/infer_batch.py  |  8 ++++++--
 .../sglang/srt/managers/controller/manager_multi.py   |  2 +-
 .../sglang/srt/managers/controller/manager_single.py  |  2 +-
 .../sglang/srt/managers/controller/model_runner.py    | 12 ++++++------
 python/sglang/srt/managers/controller/tp_worker.py    |  8 ++++----
 python/sglang/srt/model_loader/utils.py               |  2 +-
 8 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/docs/en/hyperparameter_tuning.md b/docs/en/hyperparameter_tuning.md
index dec516bc92..85315e7457 100644
--- a/docs/en/hyperparameter_tuning.md
+++ b/docs/en/hyperparameter_tuning.md
@@ -6,7 +6,7 @@ Achieving a large batch size is the most important thing for attaining high thro
 
 When the server is running at full load, look for the following in the log:
 
-```[gpu_id=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
+```[gpu=0] Decode batch. #running-req: 233, #token: 370959, token usage: 0.82, gen throughput (token/s): 4594.01, #queue-req: 417```
 
 ### Tune Your Request Submission Speed
 `#queue-req` indicates the number of requests in the queue. If you frequently see `#queue-req == 0`, it suggests you are bottlenecked by the request submission speed.
diff --git a/python/sglang/lang/backend/openai.py b/python/sglang/lang/backend/openai.py
index 06701cb371..48dcc080e1 100644
--- a/python/sglang/lang/backend/openai.py
+++ b/python/sglang/lang/backend/openai.py
@@ -18,7 +18,7 @@
     openai = tiktoken = e
 
 
-logger = logging.getLogger("openai")
+logger = logging.getLogger(__name__)
 
 
 def create_logit_bias_int(tokenizer):
diff --git a/python/sglang/srt/managers/controller/infer_batch.py b/python/sglang/srt/managers/controller/infer_batch.py
index 9f619ed649..a80a9d657e 100644
--- a/python/sglang/srt/managers/controller/infer_batch.py
+++ b/python/sglang/srt/managers/controller/infer_batch.py
@@ -15,6 +15,7 @@
 
 """Meta data for requests and batches"""
 
+import logging
 import warnings
 from dataclasses import dataclass
 from enum import IntEnum, auto
@@ -40,6 +41,9 @@
 }
 
 
+logger = logging.getLogger(__name__)
+
+
 class ForwardMode(IntEnum):
     # Prefill a new sequence. This is deprecated now. "EXTEND" covers this case.
     PREFILL = auto()
@@ -379,7 +383,7 @@ def prepare_for_extend(self, vocab_size: int, int_token_logit_bias: torch.Tensor
 
         out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
         if out_cache_loc is None:
-            print("Prefill out of memory. This should never happen.")
+            logger.error("Prefill out of memory. This should never happen.")
             self.tree_cache.pretty_print()
             exit()
 
@@ -613,7 +617,7 @@ def prepare_for_decode(self, input_ids=None):
 
         self.out_cache_loc = self.token_to_kv_pool.alloc(bs)
         if self.out_cache_loc is None:
-            print("Decode out of memory. This should never happen.")
+            logger.error("Decode out of memory. This should never happen.")
             self.tree_cache.pretty_print()
             exit()
 
diff --git a/python/sglang/srt/managers/controller/manager_multi.py b/python/sglang/srt/managers/controller/manager_multi.py
index f228472486..08c9db82b8 100644
--- a/python/sglang/srt/managers/controller/manager_multi.py
+++ b/python/sglang/srt/managers/controller/manager_multi.py
@@ -39,7 +39,7 @@
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)
 
 
 class LoadBalanceMethod(Enum):
diff --git a/python/sglang/srt/managers/controller/manager_single.py b/python/sglang/srt/managers/controller/manager_single.py
index 67009dc44b..012d6c1d64 100644
--- a/python/sglang/srt/managers/controller/manager_single.py
+++ b/python/sglang/srt/managers/controller/manager_single.py
@@ -31,7 +31,7 @@
 from sglang.srt.utils import kill_parent_process
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger("srt.controller")
+logger = logging.getLogger(__name__)
 
 
 class ControllerSingle:
diff --git a/python/sglang/srt/managers/controller/model_runner.py b/python/sglang/srt/managers/controller/model_runner.py
index 4ae62dfeee..24c59b6ff8 100644
--- a/python/sglang/srt/managers/controller/model_runner.py
+++ b/python/sglang/srt/managers/controller/model_runner.py
@@ -57,7 +57,7 @@
     monkey_patch_vllm_qvk_linear_loader,
 )
 
-logger = logging.getLogger("srt.model_runner")
+logger = logging.getLogger(__name__)
 
 
 class ModelRunner:
@@ -90,7 +90,7 @@ def __init__(
 
         # Init torch distributed
        torch.cuda.set_device(self.gpu_id)
-        logger.info(f"[gpu_id={self.gpu_id}] Init nccl begin.")
+        logger.info(f"[gpu={self.gpu_id}] Init nccl begin.")
 
         if not server_args.enable_p2p_check:
             monkey_patch_vllm_p2p_access_check(self.gpu_id)
@@ -130,7 +130,7 @@ def __init__(
 
     def load_model(self):
         logger.info(
-            f"[gpu_id={self.gpu_id}] Load weight begin. "
+            f"[gpu={self.gpu_id}] Load weight begin. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
 
@@ -178,7 +178,7 @@ def load_model(self):
             cache_config=None,
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Load weight end. "
+            f"[gpu={self.gpu_id}] Load weight end. "
             f"type={type(self.model).__name__}, "
             f"dtype={self.dtype}, "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
@@ -229,7 +229,7 @@ def init_memory_pool(self, total_gpu_memory, max_num_reqs=None):
             layer_num=self.model_config.num_hidden_layers,
         )
         logger.info(
-            f"[gpu_id={self.gpu_id}] Memory pool end. "
+            f"[gpu={self.gpu_id}] Memory pool end. "
             f"avail mem={get_available_gpu_memory(self.gpu_id):.2f} GB"
         )
 
@@ -280,7 +280,7 @@ def init_cuda_graphs(self):
             return
 
         logger.info(
-            f"[gpu_id={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
+            f"[gpu={self.gpu_id}] Capture cuda graph begin. This can take up to several minutes."
         )
         batch_size_list = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         self.cuda_graph_runner = CudaGraphRunner(
diff --git a/python/sglang/srt/managers/controller/tp_worker.py b/python/sglang/srt/managers/controller/tp_worker.py
index b84bdc33a2..94e535d147 100644
--- a/python/sglang/srt/managers/controller/tp_worker.py
+++ b/python/sglang/srt/managers/controller/tp_worker.py
@@ -55,7 +55,7 @@
 )
 from sglang.utils import get_exception_traceback
 
-logger = logging.getLogger("srt.tp_worker")
+logger = logging.getLogger(__name__)
 
 
 class ModelTpServer:
@@ -132,7 +132,7 @@ def __init__(
 
         # Print info
         logger.info(
-            f"[gpu_id={self.gpu_id}] "
+            f"[gpu={self.gpu_id}] "
             f"max_total_num_tokens={self.max_total_num_tokens}, "
             f"max_prefill_tokens={self.max_prefill_tokens}, "
             f"max_running_requests={self.max_running_requests}, "
@@ -256,7 +256,7 @@ def print_stats(self):
         self.num_generated_tokens = 0
         self.last_stats_tic = time.time()
         logger.info(
-            f"[gpu_id={self.gpu_id}] Decode batch. "
+            f"[gpu={self.gpu_id}] Decode batch. "
             f"#running-req: {len(self.running_batch.reqs)}, "
             f"#token: {num_used}, "
             f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
@@ -434,7 +434,7 @@ def get_new_prefill_batch(self) -> Optional[Batch]:
                 self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
             )
             logger.info(
-                f"[gpu_id={self.gpu_id}] Prefill batch. "
+                f"[gpu={self.gpu_id}] Prefill batch. "
                 f"#new-seq: {len(can_run_list)}, "
                 f"#new-token: {new_batch_input_tokens}, "
                 f"#cached-token: {hit_tokens}, "
diff --git a/python/sglang/srt/model_loader/utils.py b/python/sglang/srt/model_loader/utils.py
index c1d1b2ff37..9d6520e2ae 100644
--- a/python/sglang/srt/model_loader/utils.py
+++ b/python/sglang/srt/model_loader/utils.py
@@ -38,7 +38,7 @@
 from sglang.srt.layers.quantization import get_quantization_config
 
 
-logger = logging.getLogger("srt.model_loader")
+logger = logging.getLogger(__name__)
 
 
 temp_dir = tempfile.gettempdir()
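
Note on the pattern this patch adopts: each module now obtains its logger with `logging.getLogger(__name__)` and reports failures through `logger.error(...)` instead of bare `print(...)`, so records carry the real module name and flow through whatever handlers and levels the application configures. The snippet below is a minimal standalone sketch of that pattern, not part of the patch itself; `report_decode_batch`, its parameters, and the numbers are illustrative only, and the `basicConfig` call stands in for however the server actually sets up logging.

```python
import logging

# Module-level logger keyed by the importing module's dotted name
# (e.g. "sglang.srt.managers.controller.tp_worker"), so records can be
# filtered per subsystem and inherit handlers from the root logger.
logger = logging.getLogger(__name__)


def report_decode_batch(gpu_id: int, running_reqs: int, used_tokens: int, max_tokens: int) -> None:
    """Emit a status line in the same style as the server log quoted in the docs."""
    logger.info(
        f"[gpu={gpu_id}] Decode batch. "
        f"#running-req: {running_reqs}, "
        f"#token: {used_tokens}, "
        f"token usage: {used_tokens / max_tokens:.2f}"
    )


if __name__ == "__main__":
    # Demo-only handler setup; a real service would configure logging centrally.
    logging.basicConfig(level=logging.INFO, format="%(message)s")
    report_decode_batch(gpu_id=0, running_reqs=233, used_tokens=370959, max_tokens=452000)
    logger.error("Decode out of memory. This should never happen.")  # goes through logging, not print()
```

Because the loggers are named after their modules, a deployment can, for example, quiet an entire subsystem with `logging.getLogger("sglang.srt").setLevel(logging.WARNING)` without touching the code.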