sgl-project · merrymercy · Nov 9, 2024 · Nov 8, 2024 · Nov 8, 2024 · Nov 8, 2024
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -17,7 +17,7 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
 [project.optional-dependencies]
 runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
-    "orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
+    "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
     "torchao", "uvicorn", "uvloop", "zmq",
     "outlines>=0.0.44", "modelscope"]
 srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]

@@ -213,19 +213,67 @@ def __init__(self, labelnames: List[str], max_model_len):
             name="sglang:e2e_request_latency_seconds",
             documentation="Histogram of End-to-end request latency in seconds",
             labelnames=labelnames,
-            buckets=build_1_2_5_buckets(max_model_len),
+            buckets=[
+                0.3,
+                0.5,
+                0.8,
+                1.0,
+                1.5,
+                2.0,
+                2.5,
+                5.0,
+                10.0,
+                15.0,
+                20.0,
+                30.0,
+                40.0,
+                50.0,
+                60.0,
+            ],
         )
         self.histogram_time_waiting_requests = Histogram(
             name="sglang:waiting_request_latency_seconds",
             documentation="Histogram of request waiting time in seconds",
             labelnames=labelnames,
-            buckets=build_1_2_5_buckets(max_model_len),
+            buckets=[
+                0.3,
+                0.5,
+                0.8,
+                1.0,
+                1.5,
+                2.0,
+                2.5,
+                5.0,
+                10.0,
+                15.0,
+                20.0,
+                30.0,
+                40.0,
+                50.0,
+                60.0,
+            ],
         )
         self.histogram_time_decode_requests = Histogram(
             name="sglang:decode_request_latency_seconds",
             documentation="Histogram of request decoding time in seconds",
             labelnames=labelnames,
-            buckets=build_1_2_5_buckets(max_model_len),
+            buckets=[
+                0.3,
+                0.5,
+                0.8,
+                1.0,
+                1.5,
+                2.0,
+                2.5,
+                5.0,
+                10.0,
+                15.0,
+                20.0,
+                30.0,
+                40.0,
+                50.0,
+                60.0,
+            ],
         )
 
 

@@ -34,15 +34,12 @@ class Stats:
     num_running_req: int = 0
     num_waiting_req: int = 0
     gen_throughput: float = 0.0
-    num_token: int = 0
-    token_usage: float = 0.0
     waiting_queue: int = 0
     time_e2e_requests: List[float] = field(default_factory=list)
     time_waiting_requests: List[float] = field(default_factory=list)
     time_decode_requests: List[float] = field(default_factory=list)
     # system stats
     token_usage: float = 0.0
-    is_mixed_chunk: bool = False
     new_seq: int = 0
     new_token: int = 0
     cached_token: int = 0

@@ -446,6 +446,9 @@ def launch_server(
     2. Inter-process communication is done through ICP (each process uses a different port) via the ZMQ library.
     """
 
+    if server_args.enable_metrics:
+        _set_prometheus_env()
+
     launch_engine(server_args=server_args)
 
     # Add api key authorization
@@ -454,7 +457,6 @@ def launch_server(
 
     # add prometheus middleware
     if server_args.enable_metrics:
-        _set_prometheus_env()
         add_prometheus_middleware(app)
 
     # Send a warmup request

diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py
@@ -404,6 +404,7 @@ def popen_launch_server(
     other_args: tuple = (),
     env: Optional[dict] = None,
     return_stdout_stderr: Optional[tuple] = None,
+    enable_metrics: bool = False,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -422,6 +423,8 @@ def popen_launch_server(
     ]
     if api_key:
         command += ["--api-key", api_key]
+    if enable_metrics:
+        command += ["--enable-metrics"]
 
     if return_stdout_stderr:
         process = subprocess.Popen(

@@ -0,0 +1,84 @@
+import unittest
+from types import SimpleNamespace
+
+import requests
+
+from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+TEST_MODEL = (
+    DEFAULT_MODEL_NAME_FOR_TEST  # I used "google/gemma-2-2b-it" for testing locally
+)
+
+
+class TestEnableMetrics(unittest.TestCase):
+    def test_metrics_enabled(self):
+        """Test that metrics endpoint returns data when enabled"""
+        # Launch server with metrics enabled
+        process = popen_launch_server(
+            model=TEST_MODEL,
+            base_url=DEFAULT_URL_FOR_TEST,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            enable_metrics=True,
+        )
+
+        try:
+            # Make a request to generate some metrics
+            response = requests.get(f"{DEFAULT_URL_FOR_TEST}/health_generate")
+            self.assertEqual(response.status_code, 200)
+
+            # Get metrics
+            metrics_response = requests.get(f"{DEFAULT_URL_FOR_TEST}/metrics")
+            self.assertEqual(metrics_response.status_code, 200)
+            metrics_content = metrics_response.text
+
+            # Verify essential metrics are present
+            essential_metrics = [
+                "sglang:prompt_tokens_total",
+                "sglang:generation_tokens_total",
+                "sglang:max_total_num_tokens",
+                "sglang:context_len",
+                "sglang:time_to_first_token_seconds",
+                "sglang:time_per_output_token_seconds",
+                "sglang:e2e_request_latency_seconds",
+            ]
+
+            for metric in essential_metrics:
+                self.assertIn(metric, metrics_content, f"Missing metric: {metric}")
+
+            # Verify model name label is present and correct
+            expected_model_name = TEST_MODEL
+            self.assertIn(f'model_name="{expected_model_name}"', metrics_content)
+            # Verify metrics have values (not empty)
+            self.assertIn("_sum{", metrics_content)
+            self.assertIn("_count{", metrics_content)
+            self.assertIn("_bucket{", metrics_content)
+
+        finally:
+            kill_child_process(process.pid, include_self=True)
+
+    def test_metrics_disabled(self):
+        """Test that metrics endpoint returns 404 when disabled"""
+        # Launch server with metrics disabled
+        process = popen_launch_server(
+            model=TEST_MODEL,
+            base_url=DEFAULT_URL_FOR_TEST,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            enable_metrics=False,
+        )
+
+        try:
+            response = requests.get(f"{DEFAULT_URL_FOR_TEST}/health_generate")
+            self.assertEqual(response.status_code, 200)
+            # Verify metrics endpoint is not available
+            metrics_response = requests.get(f"{DEFAULT_URL_FOR_TEST}/metrics")
+            self.assertEqual(metrics_response.status_code, 404)
+
+        finally:
+            kill_child_process(process.pid, include_self=True)