diff --git a/examples/monitoring/docker-compose.yaml b/examples/monitoring/docker-compose.yaml new file mode 100644 index 00000000000..d121368cc85 --- /dev/null +++ b/examples/monitoring/docker-compose.yaml @@ -0,0 +1,16 @@ +services: + prometheus: + image: prom/prometheus:latest + network_mode: host + ports: + - "9090:9090" # the default port used by Prometheus + volumes: + - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file + + grafana: + image: grafana/grafana:latest + network_mode: host + depends_on: + - prometheus + ports: + - "3000:3000" # the default port used by Grafana \ No newline at end of file diff --git a/examples/monitoring/grafana.json b/examples/monitoring/grafana.json new file mode 100644 index 00000000000..76e0c7f5c31 --- /dev/null +++ b/examples/monitoring/grafana.json @@ -0,0 +1,1724 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "max-running-requests from server argument", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, 
+ "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:max_running_requests{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Running Requests", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "Supported context length with loaded model", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 3, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:context_len{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Context Length", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "max_total_tokens", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:max_total_num_tokens{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Total Num Tokens", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "max_prefill_tokens from server args", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:max_prefill_tokens{instance=\"$instance\", 
name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Prefill Tokens", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:cached_token{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Cached Tokens", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + 
"percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sglang:cache_hit_rate{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Cache Hit Rate (%)", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": 
"rate(sglang:e2e_request_latency_seconds_sum{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval]) / rate(sglang:e2e_request_latency_seconds_count[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E", + "useBackend": false + } + ], + "title": "E2E Request Latency (S)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "sglang:gen_throughput", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Generation Throughput (Token / S)", + "type": "timeseries" + }, + { 
+ "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:num_requests_running{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Num Requests Running", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:num_requests_waiting{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Number of Requests Waiting", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(sglang:decode_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by(le) (rate(sglang:decode_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le) (rate(sglang:decode_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, 
+ "includeNullMetadata": true, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by(le) (rate(sglang:decode_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(sglang:decode_request_latency_seconds_sum{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval]) / rate(sglang:decode_request_latency_seconds_count[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E", + "useBackend": false + } + ], + "title": "Time Request Decoding (S)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "Time requests waiting before added to batch", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": 
"none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket[$__rate_interval])))", + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": 
"rate(sglang:waiting_request_latency_seconds_sum[$__rate_interval])\r\n/\r\nrate(sglang:waiting_request_latency_seconds_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "Time Request Waiting (S)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(rate(sglang:request_prompt_tokens_sum{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])) by (instance, name)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + 
"useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Prompt Tokens", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(rate(sglang:request_generation_tokens_sum{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])) by (instance, name)", + "fullMetaSearch": false, + "includeNullMetadata": 
true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Generated Tokens", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 13, + "options": { + "calculate": false, + "calculation": { + "yBuckets": { + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(le) (increase(sglang:request_prompt_tokens_bucket{name=\"$model_name\", instance=\"$instance\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request Prompt Tokens (WIP)", + "type": "heatmap" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 12, + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size", + "value": "" + }, + "yBuckets": { + "mode": "size", + "scale": { + "log": 2, + "type": "log" + }, + "value": "" + } + }, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "Request count" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisLabel": "Generation Length", + "axisPlacement": "left", + "reverse": false, + "unit": "none" + } + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(le) (increase(sglang:request_generation_tokens_bucket{name=\"$model_name\", instance=\"$instance\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request Generation Tokens (WIP)", + "type": "heatmap" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "127.0.0.1:30000", + "value": "127.0.0.1:30000" + }, + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "definition": "label_values(instance)", + "hide": 0, + "includeAll": false, + "label": "instance", + "multi": false, + "name": "instance", + "options": [], + "query": { + "qryType": 1, + "query": 
"label_values(instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "google/gemma-2-9b-it", + "value": "google/gemma-2-9b-it" + }, + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "definition": "label_values(name)", + "hide": 0, + "includeAll": false, + "label": "name", + "multi": false, + "name": "model_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "SGLang Dashboard", + "uid": "ddyp55uq7brpcc", + "version": 12, + "weekStart": "" +} \ No newline at end of file diff --git a/examples/monitoring/prometheus.yaml b/examples/monitoring/prometheus.yaml new file mode 100644 index 00000000000..ba16ac3bd30 --- /dev/null +++ b/examples/monitoring/prometheus.yaml @@ -0,0 +1,10 @@ +# prometheus.yaml +global: + scrape_interval: 5s + evaluation_interval: 30s + +scrape_configs: + - job_name: sglang + static_configs: + - targets: + - '127.0.0.1:30000' diff --git a/python/pyproject.toml b/python/pyproject.toml index 69bfb7ca4a2..136d0fc205f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -23,7 +23,9 @@ dependencies = [ srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow", "psutil", "pydantic", "python-multipart", "torch", "torchao", "uvicorn", "uvloop", "zmq", - "vllm==0.5.5", "outlines>=0.0.44", "modelscope"] + "vllm==0.5.5", "outlines>=0.0.44", "modelscope", + "prometheus-client>=0.20.0"] + openai = ["openai>=1.0", "tiktoken"] anthropic = ["anthropic>=0.20.0"] litellm = ["litellm>=1.0.0"] diff --git 
a/python/sglang/srt/managers/policy_scheduler.py b/python/sglang/srt/managers/policy_scheduler.py index ada3904182c..2f7a881b7d7 100644 --- a/python/sglang/srt/managers/policy_scheduler.py +++ b/python/sglang/srt/managers/policy_scheduler.py @@ -17,6 +17,7 @@ import os import random +import time from collections import defaultdict from contextlib import contextmanager from typing import Dict, List, Optional @@ -286,6 +287,7 @@ def add_one_req(self, req: Req): or (req.return_logprob and req.normalized_prompt_logprob is None) ): # Non-chunked prefill + req.queued_time = time.time() self.can_run_list.append(req) self.tree_cache.inc_lock_ref(req.last_node) self._prefill_one_req( @@ -301,6 +303,7 @@ def add_one_req(self, req: Req): req.extend_input_len = trunc_len req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len] + req.queued_time = time.time() self.can_run_list.append(req) self.new_inflight_req = req self.tree_cache.inc_lock_ref(req.last_node) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index c4c91c7112e..23dcc8b659b 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -18,6 +18,7 @@ """Meta data for requests and batches""" import logging +import time from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -189,6 +190,16 @@ def __init__( self.regex_fsm_state: int = 0 self.jump_forward_map: JumpForwardMap = None + # Lifetime traces + # time when request is created and added to waitlist + self.created_time = None + # time when request is added to prefill batch + self.queued_time = None + # time when request is being processed + self.started_time = None + # time when request is finished + self.finished_time = None + # whether request reached finished condition def finished(self) -> bool: return self.finished_reason is not None @@ -262,6 +273,7 @@ def check_finished(self): return if len(self.output_ids) >=
self.sampling_params.max_new_tokens: + self.finished_time = time.time() self.finished_reason = FINISH_LENGTH( length=self.sampling_params.max_new_tokens ) @@ -275,6 +287,7 @@ def check_finished(self): matched_eos |= last_token_id == self.tokenizer.eos_token_id if matched_eos and not self.sampling_params.ignore_eos: + self.finished_time = time.time() self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id) return @@ -285,6 +298,7 @@ def check_finished(self): for stop_str in self.sampling_params.stop_strs: if stop_str in tail_str or stop_str in self.decoded_text: + self.finished_time = time.time() self.finished_reason = FINISH_MATCHED_STR(matched=stop_str) return @@ -426,6 +440,10 @@ def alloc_token_slots(self, num_tokens: int): return out_cache_loc + def mark_reqs_started(self): + for req in self.reqs: + req.started_time = time.time() + def prepare_for_extend(self, vocab_size: int): self.forward_mode = ForwardMode.EXTEND diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index b9690670073..c174bf31e00 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -54,6 +54,7 @@ ) from sglang.srt.mem_cache.chunk_cache import ChunkCache from sglang.srt.mem_cache.radix_cache import RadixCache +from sglang.srt.metrics.metrics_types import Stats from sglang.srt.model_executor.model_runner import ModelRunner from sglang.srt.server_args import ServerArgs from sglang.srt.utils import ( @@ -138,6 +139,25 @@ def __init__( self.max_total_num_tokens - 1, ) + # Setup Metrics and Collectors + self.time_e2e_requests: List[float] = [] + self.time_waiting_requests: List[float] = [] + self.time_decode_requests: List[float] = [] + self._stats = Stats() + self._stats.max_total_num_tokens = self.max_total_num_tokens + self._stats.max_prefill_tokens = self.max_prefill_tokens + self._stats.max_running_requests = self.max_running_requests + self._stats.context_len = self.model_config.context_len + 
# Lazy loading to ensure prometheus is initialized + from sglang.srt.metrics.metrics_collector import PrometheusMetricsCollector + + self.metrics_collector = PrometheusMetricsCollector( + labels={ + "name": self.model_config.path, + # TODO: Add lora name/path in the future, + }, + max_model_len=self.max_total_num_tokens, + ) # Sync random seed across TP workers server_args.random_seed = broadcast_recv_input( [server_args.random_seed], @@ -241,6 +261,10 @@ def exposed_step(self, recv_reqs: List): # Forward self.forward_step() + # log stats + if self.tp_rank == 0: + self.log_step_metrics() + except Exception: logger.error("Exception in ModelTpServer:\n" + get_exception_traceback()) raise @@ -304,6 +328,49 @@ def print_decode_stats(self): f"#queue-req: {len(self.waiting_queue)}" ) + def log_step_metrics(self): + + self._stats.time_e2e_requests = self.time_e2e_requests + self._stats.time_waiting_requests = self.time_waiting_requests + self._stats.time_decode_requests = self.time_decode_requests + + if self.running_batch: + num_used = self.max_total_num_tokens - ( + self.token_to_kv_pool.available_size() + + self.tree_cache.evictable_size() + ) + token_usage = num_used / self.max_total_num_tokens + throughput = self.num_generated_tokens / (time.time() - self.last_stats_tic) + self.num_generated_tokens = 0 + self.last_stats_tic = time.time() + + self._stats.num_running_req = len(self.running_batch.reqs) + self._stats.num_waiting_req = len(self.waiting_queue) + self._stats.gen_throughput = throughput + self._stats.num_token = num_used + self._stats.token_usage = token_usage + else: + self._stats.num_running_req = 0 + self._stats.num_waiting_req = 0 + self._stats.gen_throughput = 0.0 + self._stats.num_token = 0 + self._stats.token_usage = 0.0 + self._stats.waiting_queue = 0 + self._stats.cache_hit_rate = 0.0 + + if self.tree_cache_metrics["total"] > 0: + self._stats.cache_hit_rate = 100.0 * ( + self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] + ) + else: + 
self._stats.cache_hit_rate = 0.0 + + self.time_e2e_requests = [] + self.time_waiting_requests = [] + self.time_decode_requests = [] + + self.log_metrics(self._stats) + def check_memory(self): available_size = ( self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size() @@ -405,7 +472,8 @@ def handle_generate_request( ), self.max_req_input_len - 1 - len(req.origin_input_ids), ) - + # TODO: add created time + req.created_time = time.time() self.waiting_queue.append(req) def handle_embedding_request( @@ -424,6 +492,7 @@ def handle_embedding_request( ) req.origin_input_ids = req.origin_input_ids[: self.max_req_input_len] + req.created_time = time.time() self.waiting_queue.append(req) def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: @@ -508,6 +577,9 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: else: tree_cache_hit_rate = 0.0 + running_req = ( + num_mixed_running if self.is_mixed_chunk == True else running_bs + ) num_used = self.max_total_num_tokens - ( self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size() @@ -536,6 +608,18 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}" ) + self._stats.is_mixed_chunk = self.is_mixed_chunk + self._stats.new_seq = len(can_run_list) + self._stats.new_token = adder.log_input_tokens + self._stats.cached_token = adder.log_hit_tokens + self._stats.cache_hit_rate = 100.0 * tree_cache_hit_rate + self._stats.running_req = running_req + self._stats.queue_req = ( + len(self.waiting_queue) - len(can_run_list) + has_inflight + ) + self._stats.token_usage = num_used / self.max_total_num_tokens + self.log_metrics(self._stats) + # Return the new batch new_batch = ScheduleBatch.init_new( can_run_list, @@ -549,6 +633,7 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: def forward_prefill_batch(self, batch: ScheduleBatch): # Build batch tensors 
batch.prepare_for_extend(self.model_config.vocab_size) + batch.mark_reqs_started() decoding_reqs = [] if self.is_mixed_chunk and self.running_batch is not None: @@ -800,6 +885,9 @@ def handle_finished_requests(self, batch: ScheduleBatch): output_rids = [] output_meta_info = [] output_finished_reason: List[BaseFinishReason] = [] + num_prompt_tokens_requests: List[int] = [] + num_generation_tokens_requests: List[int] = [] + finished_reason_requests: List[str] = [] if self.model_runner.is_generation: output_vids = [] decoded_texts = [] @@ -822,6 +910,9 @@ def handle_finished_requests(self, batch: ScheduleBatch): or len(req.output_ids) == 1 ) ): + self.time_e2e_requests.append(req.finished_time - req.created_time) + self.time_waiting_requests.append(req.queued_time - req.created_time) + self.time_decode_requests.append(req.finished_time - req.started_time) output_rids.append(req.rid) output_finished_reason.append(req.finished_reason) if self.model_runner.is_generation: @@ -837,6 +928,16 @@ def handle_finished_requests(self, batch: ScheduleBatch): req.sampling_params.spaces_between_special_tokens ) + num_prompt_tokens_requests.append(len(req.origin_input_ids)) + num_generation_tokens_requests.append(len(req.output_ids)) + finished_reason_requests.append( + ( + req.finished_reason.to_json() + if req.finished_reason is not None + else None + ) + ) + meta_info = { "prompt_tokens": len(req.origin_input_ids), "completion_tokens": len(req.output_ids), @@ -868,6 +969,12 @@ def handle_finished_requests(self, batch: ScheduleBatch): "prompt_tokens": len(req.origin_input_ids), } output_meta_info.append(meta_info) + stats = Stats( + num_generation_tokens_requests=num_generation_tokens_requests, + num_prompt_tokens_requests=num_prompt_tokens_requests, + finished_reason_requests=finished_reason_requests, + ) + self.log_metrics(stats) # Send to detokenizer if output_rids: @@ -948,6 +1055,11 @@ def update_weights(self, recv_req): logger.error(message) return success, message + def 
log_metrics(self, stats): + """Collect metrics.""" + + self.metrics_collector.log_stats(stats) + def run_tp_server( gpu_id: int, diff --git a/python/sglang/srt/metrics/metrics_collector.py b/python/sglang/srt/metrics/metrics_collector.py new file mode 100644 index 00000000000..9a06b13864a --- /dev/null +++ b/python/sglang/srt/metrics/metrics_collector.py @@ -0,0 +1,261 @@ +""" +Copyright 2023-2024 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +"""Utilities for Prometheus Metrics Collection.""" + +import logging +from abc import ABC, abstractmethod +from typing import Counter as CollectionsCounter +from typing import Dict, List, Union + +import numpy as np +from prometheus_client import Counter, Gauge, Histogram + +from sglang.srt.metrics.metrics_types import Stats + + +class Metrics: + """ + SGLang Metrics + """ + + def __init__(self, labelnames: List[str], max_model_len): + + # Configuration Stats + self.max_total_num_tokens = Gauge( + name="sglang:max_total_num_tokens", + documentation="Maximum total number of tokens", + labelnames=labelnames, + multiprocess_mode="min", + ) # static across processes + + self.max_prefill_tokens = Gauge( + name="sglang:max_prefill_tokens", + documentation="Maximum prefill tokens", + labelnames=labelnames, + multiprocess_mode="min", + ) # static across processes + + self.max_running_requests = Gauge( + name="sglang:max_running_requests", + documentation="Maximum running requests", + labelnames=labelnames, + multiprocess_mode="min", + ) # 
static across processes + + self.context_len = Gauge( + name="sglang:context_len", + documentation="Context length", + labelnames=labelnames, + multiprocess_mode="min", + ) # static across processes + # Decode Stats + self.num_running_sys = Gauge( + name="sglang:num_requests_running", + documentation="Number of requests currently running on GPU", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.num_waiting_sys = Gauge( + name="sglang:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.gen_throughput = Gauge( + name="sglang:gen_throughput", + documentation="Gen token throughput (token/s)", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.token_usage = Gauge( + name="sglang:token_usage", + documentation="Total token usage", + labelnames=labelnames, + multiprocess_mode="sum", + ) + # System Stats + # KV Cache Usage in % + # self.gpu_cache_usage_sys = Gauge( + # "gpu_cache_usage_perc", + # "GPU KV-cache usage. 
1 means 100 percent usage.", + # labelnames=labelnames, + # multiprocess_mode="sum") + + self.new_seq = Gauge( + name="sglang:new_seq", + documentation="Number of new sequences", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.new_token = Gauge( + name="sglang:new_token", + documentation="Number of new token", + labelnames=labelnames, + multiprocess_mode="sum", + ) + # Prefix caching block hit rate + self.cached_token = Gauge( + name="sglang:cached_token", + documentation="Number of cached token", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.cache_hit_rate = Gauge( + name="sglang:cache_hit_rate", + documentation="Cache hit rate", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.queue_req = Gauge( + name="sglang:queue_req", + documentation="Number of queued requests", + labelnames=labelnames, + multiprocess_mode="sum", + ) + + # Request Stats + # Metadata + self.num_prompt_tokens_requests = Histogram( + name="sglang:request_prompt_tokens", + documentation="Number of prefill tokens processed", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len), + ) + self.num_generation_tokens_requests = Histogram( + name="sglang:request_generation_tokens", + documentation="Number of generation tokens processed.", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len), + ) + self.finished_reason_requests = Counter( + name="sglang:request_success_total", + documentation="Count of successfully processed requests.", + labelnames=labelnames + ["finished_reason"], + ) + self.histogram_time_e2e_requests = Histogram( + name="sglang:e2e_request_latency_seconds", + documentation="Histogram of End-to-end request latency in seconds", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len), + ) + self.histogram_time_waiting_requests = Histogram( + name="sglang:waiting_request_latency_seconds", + documentation="Histogram of request waiting time in seconds", + labelnames=labelnames, + 
buckets=build_1_2_5_buckets(max_model_len), + ) + self.histogram_time_decode_requests = Histogram( + name="sglang:decode_request_latency_seconds", + documentation="Histogram of request decoding time in seconds", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len), + ) + + +class MetricsCollector(ABC): + """ + SGLang Metrics Collector + """ + + @abstractmethod + def log_stats(self, stats: Stats) -> None: + pass + + +class PrometheusMetricsCollector(MetricsCollector): + """ + SGLang Metrics Collector + """ + + def __init__(self, labels: Dict[str, str], max_model_len: int) -> None: + self.labels = labels + self.metrics = Metrics( + labelnames=list(labels.keys()), max_model_len=max_model_len + ) + + def _log_gauge(self, gauge, data: Union[int, float]) -> None: + # Convenience function for logging to gauge. + gauge.labels(**self.labels).set(data) + + def _log_counter(self, counter, data: Union[int, float]) -> None: + # Convenience function for logging to counter. + counter.labels(**self.labels).inc(data) + + def _log_counter_labels( + self, counter, data: CollectionsCounter, label_key: str + ) -> None: + # Convenience function for collection counter of labels. + for label, count in data.items(): + counter.labels(**{**self.labels, label_key: label}).inc(count) + + def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None: + # Convenience function for logging list to histogram. 
+ for datum in data: + histogram.labels(**self.labels).observe(datum) + + def log_stats(self, stats: Stats) -> None: + self._log_gauge(self.metrics.max_total_num_tokens, stats.max_total_num_tokens) + self._log_gauge(self.metrics.max_prefill_tokens, stats.max_prefill_tokens) + self._log_gauge(self.metrics.max_running_requests, stats.max_running_requests) + self._log_gauge(self.metrics.context_len, stats.context_len) + self._log_histogram( + self.metrics.num_prompt_tokens_requests, stats.num_prompt_tokens_requests + ) + self._log_histogram( + self.metrics.num_generation_tokens_requests, + stats.num_generation_tokens_requests, + ) + # self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys) + self._log_gauge(self.metrics.num_running_sys, stats.num_running_req) + self._log_gauge(self.metrics.num_waiting_sys, stats.num_waiting_req) + self._log_gauge(self.metrics.gen_throughput, stats.gen_throughput) + self._log_gauge(self.metrics.token_usage, stats.token_usage) + self._log_histogram( + self.metrics.histogram_time_e2e_requests, stats.time_e2e_requests + ) + self._log_histogram( + self.metrics.histogram_time_waiting_requests, stats.time_waiting_requests + ) + self._log_histogram( + self.metrics.histogram_time_decode_requests, stats.time_decode_requests + ) + self._log_gauge(self.metrics.new_seq, stats.new_seq) + self._log_gauge(self.metrics.new_token, stats.new_token) + self._log_gauge(self.metrics.cached_token, stats.cached_token) + self._log_gauge(self.metrics.cache_hit_rate, stats.cache_hit_rate) + self._log_gauge(self.metrics.queue_req, stats.queue_req) + + +def build_1_2_5_buckets(max_value: int) -> List[int]: + """ + Builds a list of buckets with increasing powers of 10 multiplied by + mantissa values (1, 2, 5) until the value exceeds the specified maximum. 
+ + Example: + >>> build_1_2_5_buckets(100) + [1, 2, 5, 10, 20, 50, 100] + """ + mantissa_lst = [1, 2, 5] + exponent = 0 + buckets: List[int] = [] + while True: + for m in mantissa_lst: + value = m * 10**exponent + if value <= max_value: + buckets.append(value) + else: + return buckets + exponent += 1 diff --git a/python/sglang/srt/metrics/metrics_types.py b/python/sglang/srt/metrics/metrics_types.py new file mode 100644 index 00000000000..adddd6c2f12 --- /dev/null +++ b/python/sglang/srt/metrics/metrics_types.py @@ -0,0 +1,54 @@ +""" +Copyright 2023-2024 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +"""Metrics Types""" + +from dataclasses import dataclass, field +from typing import List + + +@dataclass +class Stats: + # config + max_total_num_tokens: int = 0 + max_prefill_tokens: int = 0 + max_running_requests: int = 0 + context_len: int = 0 + # request stats + num_prompt_tokens_requests: List[int] = field(default_factory=list) + num_generation_tokens_requests: List[int] = field(default_factory=list) + finished_reason_requests: List[str] = field(default_factory=list) + # decode stats + num_running_req: int = 0 + num_waiting_req: int = 0 + gen_throughput: float = 0.0 + num_token: int = 0 + token_usage: float = 0.0 + waiting_queue: int = 0 + time_e2e_requests: List[float] = field(default_factory=list) + time_waiting_requests: List[float] = field(default_factory=list) + time_decode_requests: List[float] = field(default_factory=list) + # system stats + token_usage: float = 0.0 + is_mixed_chunk: bool = False + new_seq: int = 0 + new_token: int = 0 + cached_token: int = 0 + cache_hit_rate: float = 0.0 + running_req: int = 0 + queue_req: int = 0 + + +# TODO Iteration stats diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 3d3a0d4bc50..de57ca2d2b7 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -24,6 +24,8 @@ import logging import multiprocessing as mp import os +import re +import tempfile import threading import time from http import HTTPStatus @@ -39,6 +41,7 @@ from fastapi import FastAPI, File, Form, Request, UploadFile from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse +from starlette.routing import Mount from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.srt.constrained import disable_cache @@ -90,6 +93,10 @@ logger = logging.getLogger(__name__) +# Temporary directory for prometheus multiprocess mode +# Cleaned up automatically when this object is garbage collected +prometheus_multiproc_dir: 
tempfile.TemporaryDirectory + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) @@ -296,6 +303,19 @@ async def retrieve_file_content(file_id: str): return await v1_retrieve_file_content(file_id) +def add_prometheus_middleware(app: FastAPI): + # Adopted from https://github.com/vllm-project/vllm/blob/v0.6.1/vllm/entrypoints/openai/api_server.py#L216 + from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess + + registry = CollectorRegistry() + multiprocess.MultiProcessCollector(registry) + metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) + + # Workaround for 307 Redirect for /metrics + metrics_route.path_regex = re.compile("^/metrics(?P.*)$") + app.routes.append(metrics_route) + + def launch_server( server_args: ServerArgs, pipe_finish_writer: Optional[mp.connection.Connection] = None, @@ -397,6 +417,8 @@ def launch_server( if server_args.api_key: add_api_key_middleware(app, server_args.api_key) + add_prometheus_middleware(app) + # Send a warmup request t = threading.Thread( target=_wait_and_warmup, args=(server_args, pipe_finish_writer, os.getpid()) @@ -425,6 +447,21 @@ def _set_envs_and_config(server_args: ServerArgs): os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + # Set prometheus multiprocess directory + # sglang uses prometheus multiprocess mode + # we need to set this before importing prometheus_client + # https://prometheus.github.io/client_python/multiprocess/ + global prometheus_multiproc_dir + if "PROMETHEUS_MULTIPROC_DIR" in os.environ: + logger.debug(f"User set PROMETHEUS_MULTIPROC_DIR detected.") + prometheus_multiproc_dir = tempfile.TemporaryDirectory( + dir=os.environ["PROMETHEUS_MULTIPROC_DIR"] + ) + else: + prometheus_multiproc_dir = tempfile.TemporaryDirectory() + os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name + logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}") + # Set ulimit set_ulimit()