diff --git a/examples/monitoring/docker-compose.yaml b/examples/monitoring/docker-compose.yaml new file mode 100644 index 00000000000..d121368cc85 --- /dev/null +++ b/examples/monitoring/docker-compose.yaml @@ -0,0 +1,16 @@ +services: + prometheus: + image: prom/prometheus:latest + network_mode: host + ports: + - "9090:9090" # the default port used by Prometheus + volumes: + - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file + + grafana: + image: grafana/grafana:latest + network_mode: host + depends_on: + - prometheus + ports: + - "3000:3000" # the default port used by Grafana \ No newline at end of file diff --git a/examples/monitoring/grafana.json b/examples/monitoring/grafana.json new file mode 100644 index 00000000000..76e0c7f5c31 --- /dev/null +++ b/examples/monitoring/grafana.json @@ -0,0 +1,1724 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [], + "panels": [ + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "max-running-requests from server argument", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, 
+ "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:max_running_requests{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Running Requests", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "Supported context length with loaded model", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 3, + "y": 0 + }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:context_len{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Context Length", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "max_total_tokens", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:max_total_num_tokens{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Total Num Tokens", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "max_prefill_tokens from server args", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:max_prefill_tokens{instance=\"$instance\", 
name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Prefill Tokens", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:cached_token{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Cached Tokens", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + 
"percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sglang:cache_hit_rate{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Cache Hit Rate (%)", + "type": "stat" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 3 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.95, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "histogram_quantile(0.5, sum by(le) (rate(sglang:e2e_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": 
"rate(sglang:e2e_request_latency_seconds_sum{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval]) / rate(sglang:e2e_request_latency_seconds_count[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E", + "useBackend": false + } + ], + "title": "E2E Request Latency (S)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 3 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "sglang:gen_throughput", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Generation Throughput (Token / S)", + "type": "timeseries" + }, + { 
+ "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:num_requests_running{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Num Requests Running", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sglang:num_requests_waiting{instance=\"$instance\", name=\"$model_name\"}", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Number of Requests Waiting", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by(le) (rate(sglang:decode_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by(le) (rate(sglang:decode_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le) (rate(sglang:decode_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, 
+ "includeNullMetadata": true, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "C", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by(le) (rate(sglang:decode_request_latency_seconds_bucket{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])))", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(sglang:decode_request_latency_seconds_sum{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval]) / rate(sglang:decode_request_latency_seconds_count[$__rate_interval])", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E", + "useBackend": false + } + ], + "title": "Time Request Decoding (S)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "Time requests waiting before added to batch", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": 
"none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket[$__rate_interval])))", + "instant": false, + "legendFormat": "P99", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "P95", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.9, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "P90", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.5, sum by (le) (rate(sglang:waiting_request_latency_seconds_bucket[$__rate_interval])))", + "hide": false, + "instant": false, + "legendFormat": "P50", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "editorMode": "code", + "expr": 
"rate(sglang:waiting_request_latency_seconds_sum[$__rate_interval])\r\n/\r\nrate(sglang:waiting_request_latency_seconds_count[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Average", + "range": true, + "refId": "E" + } + ], + "title": "Time Request Waiting (S)", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(rate(sglang:request_prompt_tokens_sum{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])) by (instance, name)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + 
"useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B", + "useBackend": false + } + ], + "title": "Prompt Tokens", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum(rate(sglang:request_generation_tokens_sum{instance=\"$instance\", name=\"$model_name\"}[$__rate_interval])) by (instance, name)", + "fullMetaSearch": false, + "includeNullMetadata": 
true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Generated Tokens", + "type": "timeseries" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 13, + "options": { + "calculate": false, + "calculation": { + "yBuckets": { + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(le) (increase(sglang:request_prompt_tokens_bucket{name=\"$model_name\", instance=\"$instance\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request Prompt Tokens (WIP)", + "type": "heatmap" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 12, + "options": { + "calculate": false, + "calculation": { + "xBuckets": { + "mode": "size", + "value": "" + }, + "yBuckets": { + "mode": "size", + "scale": { + "log": 2, + "type": "log" + }, + "value": "" + } + }, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Spectral", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto", + "value": "Request count" + }, + "tooltip": { + "mode": "single", + "showColorScale": false, + "yHistogram": true + }, + "yAxis": { + "axisLabel": "Generation Length", + "axisPlacement": "left", + "reverse": false, + "unit": "none" + } + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "sum by(le) (increase(sglang:request_generation_tokens_bucket{name=\"$model_name\", instance=\"$instance\"}[$__rate_interval]))", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{__name__}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request Generation Tokens (WIP)", + "type": "heatmap" + } + ], + "refresh": "5s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "127.0.0.1:30000", + "value": "127.0.0.1:30000" + }, + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "definition": "label_values(instance)", + "hide": 0, + "includeAll": false, + "label": "instance", + "multi": false, + "name": "instance", + "options": [], + "query": { + "qryType": 1, + "query": 
"label_values(instance)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "google/gemma-2-9b-it", + "value": "google/gemma-2-9b-it" + }, + "datasource": { + "type": "prometheus", + "uid": "ddyfngn31dg5cf" + }, + "definition": "label_values(name)", + "hide": 0, + "includeAll": false, + "label": "name", + "multi": false, + "name": "model_name", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "SGLang Dashboard", + "uid": "ddyp55uq7brpcc", + "version": 12, + "weekStart": "" +} \ No newline at end of file diff --git a/examples/monitoring/prometheus.yaml b/examples/monitoring/prometheus.yaml new file mode 100644 index 00000000000..ba16ac3bd30 --- /dev/null +++ b/examples/monitoring/prometheus.yaml @@ -0,0 +1,10 @@ +# prometheus.yaml +global: + scrape_interval: 5s + evaluation_interval: 30s + +scrape_configs: + - job_name: sglang + static_configs: + - targets: + - '127.0.0.1:30000' diff --git a/python/pyproject.toml b/python/pyproject.toml index 69bfb7ca4a2..136d0fc205f 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -23,7 +23,9 @@ dependencies = [ srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow", "psutil", "pydantic", "python-multipart", "torch", "torchao", "uvicorn", "uvloop", "zmq", - "vllm==0.5.5", "outlines>=0.0.44", "modelscope"] + "vllm==0.5.5", "outlines>=0.0.44", "modelscope", + "prometheus-client>=0.20.0"] + openai = ["openai>=1.0", "tiktoken"] anthropic = ["anthropic>=0.20.0"] litellm = ["litellm>=1.0.0"] diff --git 
a/python/sglang/srt/managers/policy_scheduler.py b/python/sglang/srt/managers/policy_scheduler.py index ada3904182c..2f7a881b7d7 100644 --- a/python/sglang/srt/managers/policy_scheduler.py +++ b/python/sglang/srt/managers/policy_scheduler.py @@ -17,6 +17,7 @@ import os import random +import time from collections import defaultdict from contextlib import contextmanager from typing import Dict, List, Optional @@ -286,6 +287,7 @@ def add_one_req(self, req: Req): or (req.return_logprob and req.normalized_prompt_logprob is None) ): # Non-chunked prefill + req.queued_time = time.time() self.can_run_list.append(req) self.tree_cache.inc_lock_ref(req.last_node) self._prefill_one_req( @@ -301,6 +303,7 @@ def add_one_req(self, req: Req): req.extend_input_len = trunc_len req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len] + req.queued_time = time.time() self.can_run_list.append(req) self.new_inflight_req = req self.tree_cache.inc_lock_ref(req.last_node) diff --git a/python/sglang/srt/managers/schedule_batch.py b/python/sglang/srt/managers/schedule_batch.py index c4c91c7112e..23dcc8b659b 100644 --- a/python/sglang/srt/managers/schedule_batch.py +++ b/python/sglang/srt/managers/schedule_batch.py @@ -18,6 +18,7 @@ """Meta data for requests and batches""" import logging +import time from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -189,6 +190,16 @@ def __init__( self.regex_fsm_state: int = 0 self.jump_forward_map: JumpForwardMap = None + # Lifetime traces + # time when request is created and added to waitlist + self.created_time = None + # time when request is added to prefill batch + self.queued_time = None + # time when request is being processed + self.started_time = None + # time when request is finished + self.finished_time = None + # whether request reached finished condition def finished(self) -> bool: return self.finished_reason is not None @@ -262,6 +273,7 @@ def check_finished(self): return if len(self.output_ids) >=
self.sampling_params.max_new_tokens: + self.finished_time = time.time() self.finished_reason = FINISH_LENGTH( length=self.sampling_params.max_new_tokens ) @@ -275,6 +287,7 @@ def check_finished(self): matched_eos |= last_token_id == self.tokenizer.eos_token_id if matched_eos and not self.sampling_params.ignore_eos: + self.finished_time = time.time() self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id) return @@ -285,6 +298,7 @@ def check_finished(self): for stop_str in self.sampling_params.stop_strs: if stop_str in tail_str or stop_str in self.decoded_text: + self.finished_time = time.time() self.finished_reason = FINISH_MATCHED_STR(matched=stop_str) return @@ -426,6 +440,10 @@ def alloc_token_slots(self, num_tokens: int): return out_cache_loc + def mark_reqs_started(self): + for req in self.reqs: + req.started_time = time.time() + def prepare_for_extend(self, vocab_size: int): self.forward_mode = ForwardMode.EXTEND diff --git a/python/sglang/srt/managers/tp_worker.py b/python/sglang/srt/managers/tp_worker.py index b9690670073..c174bf31e00 100644 --- a/python/sglang/srt/managers/tp_worker.py +++ b/python/sglang/srt/managers/tp_worker.py @@ -54,6 +54,7 @@ ) from sglang.srt.mem_cache.chunk_cache import ChunkCache from sglang.srt.mem_cache.radix_cache import RadixCache +from sglang.srt.metrics.metrics_types import Stats from sglang.srt.model_executor.model_runner import ModelRunner from sglang.srt.server_args import ServerArgs from sglang.srt.utils import ( @@ -138,6 +139,25 @@ def __init__( self.max_total_num_tokens - 1, ) + # Setup Metrics and Collectors + self.time_e2e_requests: List[float] = [] + self.time_waiting_requests: List[float] = [] + self.time_decode_requests: List[float] = [] + self._stats = Stats() + self._stats.max_total_num_tokens = self.max_total_num_tokens + self._stats.max_prefill_tokens = self.max_prefill_tokens + self._stats.max_running_requests = self.max_running_requests + self._stats.context_len = self.model_config.context_len + 
# Lazy loading to ensure prometheus is initialized + from sglang.srt.metrics.metrics_collector import PrometheusMetricsCollector + + self.metrics_collector = PrometheusMetricsCollector( + labels={ + "name": self.model_config.path, + # TODO: Add lora name/path in the future, + }, + max_model_len=self.max_total_num_tokens, + ) # Sync random seed across TP workers server_args.random_seed = broadcast_recv_input( [server_args.random_seed], @@ -241,6 +261,10 @@ def exposed_step(self, recv_reqs: List): # Forward self.forward_step() + # log stats + if self.tp_rank == 0: + self.log_step_metrics() + except Exception: logger.error("Exception in ModelTpServer:\n" + get_exception_traceback()) raise @@ -304,6 +328,49 @@ def print_decode_stats(self): f"#queue-req: {len(self.waiting_queue)}" ) + def log_step_metrics(self): + + self._stats.time_e2e_requests = self.time_e2e_requests + self._stats.time_waiting_requests = self.time_waiting_requests + self._stats.time_decode_requests = self.time_decode_requests + + if self.running_batch: + num_used = self.max_total_num_tokens - ( + self.token_to_kv_pool.available_size() + + self.tree_cache.evictable_size() + ) + token_usage = num_used / self.max_total_num_tokens + throughput = self.num_generated_tokens / (time.time() - self.last_stats_tic) + self.num_generated_tokens = 0 + self.last_stats_tic = time.time() + + self._stats.num_running_req = len(self.running_batch.reqs) + self._stats.num_waiting_req = len(self.waiting_queue) + self._stats.gen_throughput = throughput + self._stats.num_token = num_used + self._stats.token_usage = token_usage + else: + self._stats.num_running_req = 0 + self._stats.num_waiting_req = 0 + self._stats.gen_throughput = 0.0 + self._stats.num_token = 0 + self._stats.token_usage = 0.0 + self._stats.waiting_queue = 0 + self._stats.cache_hit_rate = 0.0 + + if self.tree_cache_metrics["total"] > 0: + self._stats.cache_hit_rate = 100.0 * ( + self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] + ) + else: + 
self._stats.cache_hit_rate = 0.0 + + self.time_e2e_requests = [] + self.time_waiting_requests = [] + self.time_decode_requests = [] + + self.log_metrics(self._stats) + def check_memory(self): available_size = ( self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size() @@ -405,7 +472,8 @@ def handle_generate_request( ), self.max_req_input_len - 1 - len(req.origin_input_ids), ) - + # TODO: add created time + req.created_time = time.time() self.waiting_queue.append(req) def handle_embedding_request( @@ -424,6 +492,7 @@ def handle_embedding_request( ) req.origin_input_ids = req.origin_input_ids[: self.max_req_input_len] + req.created_time = time.time() self.waiting_queue.append(req) def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: @@ -508,6 +577,9 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: else: tree_cache_hit_rate = 0.0 + running_req = ( + num_mixed_running if self.is_mixed_chunk == True else running_bs + ) num_used = self.max_total_num_tokens - ( self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size() @@ -536,6 +608,18 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}" ) + self._stats.is_mixed_chunk = self.is_mixed_chunk + self._stats.new_seq = len(can_run_list) + self._stats.new_token = adder.log_input_tokens + self._stats.cached_token = adder.log_hit_tokens + self._stats.cache_hit_rate = 100.0 * tree_cache_hit_rate + self._stats.running_req = running_req + self._stats.queue_req = ( + len(self.waiting_queue) - len(can_run_list) + has_inflight + ) + self._stats.token_usage = num_used / self.max_total_num_tokens + self.log_metrics(self._stats) + # Return the new batch new_batch = ScheduleBatch.init_new( can_run_list, @@ -549,6 +633,7 @@ def get_new_prefill_batch(self) -> Optional[ScheduleBatch]: def forward_prefill_batch(self, batch: ScheduleBatch): # Build batch tensors 
batch.prepare_for_extend(self.model_config.vocab_size) + batch.mark_reqs_started() decoding_reqs = [] if self.is_mixed_chunk and self.running_batch is not None: @@ -800,6 +885,9 @@ def handle_finished_requests(self, batch: ScheduleBatch): output_rids = [] output_meta_info = [] output_finished_reason: List[BaseFinishReason] = [] + num_prompt_tokens_requests: List[int] = [] + num_generation_tokens_requests: List[int] = [] + finished_reason_requests: List[str] = [] if self.model_runner.is_generation: output_vids = [] decoded_texts = [] @@ -822,6 +910,9 @@ def handle_finished_requests(self, batch: ScheduleBatch): or len(req.output_ids) == 1 ) ): + self.time_e2e_requests.append(req.finished_time - req.created_time) + self.time_waiting_requests.append(req.queued_time - req.created_time) + self.time_decode_requests.append(req.finished_time - req.started_time) output_rids.append(req.rid) output_finished_reason.append(req.finished_reason) if self.model_runner.is_generation: @@ -837,6 +928,16 @@ def handle_finished_requests(self, batch: ScheduleBatch): req.sampling_params.spaces_between_special_tokens ) + num_prompt_tokens_requests.append(len(req.origin_input_ids)) + num_generation_tokens_requests.append(len(req.output_ids)) + finished_reason_requests.append( + ( + req.finished_reason.to_json() + if req.finished_reason is not None + else None + ) + ) + meta_info = { "prompt_tokens": len(req.origin_input_ids), "completion_tokens": len(req.output_ids), @@ -868,6 +969,12 @@ def handle_finished_requests(self, batch: ScheduleBatch): "prompt_tokens": len(req.origin_input_ids), } output_meta_info.append(meta_info) + stats = Stats( + num_generation_tokens_requests=num_generation_tokens_requests, + num_prompt_tokens_requests=num_prompt_tokens_requests, + finished_reason_requests=finished_reason_requests, + ) + self.log_metrics(stats) # Send to detokenizer if output_rids: @@ -948,6 +1055,11 @@ def update_weights(self, recv_req): logger.error(message) return success, message + def 
log_metrics(self, stats): + """Collect metrics.""" + + self.metrics_collector.log_stats(stats) + def run_tp_server( gpu_id: int, diff --git a/python/sglang/srt/metrics/metrics_collector.py b/python/sglang/srt/metrics/metrics_collector.py new file mode 100644 index 00000000000..9a06b13864a --- /dev/null +++ b/python/sglang/srt/metrics/metrics_collector.py @@ -0,0 +1,261 @@ +""" +Copyright 2023-2024 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +"""Utilities for Prometheus Metrics Collection.""" + +import logging +from abc import ABC, abstractmethod +from typing import Counter as CollectionsCounter +from typing import Dict, List, Union + +import numpy as np +from prometheus_client import Counter, Gauge, Histogram + +from sglang.srt.metrics.metrics_types import Stats + + +class Metrics: + """ + SGLang Metrics + """ + + def __init__(self, labelnames: List[str], max_model_len): + + # Configuration Stats + self.max_total_num_tokens = Gauge( + name="sglang:max_total_num_tokens", + documentation="Maximum total number of tokens", + labelnames=labelnames, + multiprocess_mode="min", + ) # static across processes + + self.max_prefill_tokens = Gauge( + name="sglang:max_prefill_tokens", + documentation="Maximum prefill tokens", + labelnames=labelnames, + multiprocess_mode="min", + ) # static across processes + + self.max_running_requests = Gauge( + name="sglang:max_running_requests", + documentation="Maximum running requests", + labelnames=labelnames, + multiprocess_mode="min", + ) # 
static across processes + + self.context_len = Gauge( + name="sglang:context_len", + documentation="Context length", + labelnames=labelnames, + multiprocess_mode="min", + ) # static across processes + # Decode Stats + self.num_running_sys = Gauge( + name="sglang:num_requests_running", + documentation="Number of requests currently running on GPU", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.num_waiting_sys = Gauge( + name="sglang:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.gen_throughput = Gauge( + name="sglang:gen_throughput", + documentation="Gen token throughput (token/s)", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.token_usage = Gauge( + name="sglang:token_usage", + documentation="Total token usage", + labelnames=labelnames, + multiprocess_mode="sum", + ) + # System Stats + # KV Cache Usage in % + # self.gpu_cache_usage_sys = Gauge( + # "gpu_cache_usage_perc", + # "GPU KV-cache usage. 
1 means 100 percent usage.", + # labelnames=labelnames, + # multiprocess_mode="sum") + + self.new_seq = Gauge( + name="sglang:new_seq", + documentation="Number of new sequences", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.new_token = Gauge( + name="sglang:new_token", + documentation="Number of new token", + labelnames=labelnames, + multiprocess_mode="sum", + ) + # Prefix caching block hit rate + self.cached_token = Gauge( + name="sglang:cached_token", + documentation="Number of cached token", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.cache_hit_rate = Gauge( + name="sglang:cache_hit_rate", + documentation="Cache hit rate", + labelnames=labelnames, + multiprocess_mode="sum", + ) + self.queue_req = Gauge( + name="sglang:queue_req", + documentation="Number of queued requests", + labelnames=labelnames, + multiprocess_mode="sum", + ) + + # Request Stats + # Metadata + self.num_prompt_tokens_requests = Histogram( + name="sglang:request_prompt_tokens", + documentation="Number of prefill tokens processed", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len), + ) + self.num_generation_tokens_requests = Histogram( + name="sglang:request_generation_tokens", + documentation="Number of generation tokens processed.", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len), + ) + self.finished_reason_requests = Counter( + name="sglang:request_success_total", + documentation="Count of successfully processed requests.", + labelnames=labelnames + ["finished_reason"], + ) + self.histogram_time_e2e_requests = Histogram( + name="sglang:e2e_request_latency_seconds", + documentation="Histogram of End-to-end request latency in seconds", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len), + ) + self.histogram_time_waiting_requests = Histogram( + name="sglang:waiting_request_latency_seconds", + documentation="Histogram of request waiting time in seconds", + labelnames=labelnames, + 
buckets=build_1_2_5_buckets(max_model_len), + ) + self.histogram_time_decode_requests = Histogram( + name="sglang:decode_request_latency_seconds", + documentation="Histogram of request decoding time in seconds", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len), + ) + + +class MetricsCollector(ABC): + """ + SGLang Metrics Collector + """ + + @abstractmethod + def log_stats(self, stats: Stats) -> None: + pass + + +class PrometheusMetricsCollector(MetricsCollector): + """ + SGLang Metrics Collector + """ + + def __init__(self, labels: Dict[str, str], max_model_len: int) -> None: + self.labels = labels + self.metrics = Metrics( + labelnames=list(labels.keys()), max_model_len=max_model_len + ) + + def _log_gauge(self, gauge, data: Union[int, float]) -> None: + # Convenience function for logging to gauge. + gauge.labels(**self.labels).set(data) + + def _log_counter(self, counter, data: Union[int, float]) -> None: + # Convenience function for logging to counter. + counter.labels(**self.labels).inc(data) + + def _log_counter_labels( + self, counter, data: CollectionsCounter, label_key: str + ) -> None: + # Convenience function for collection counter of labels. + for label, count in data.items(): + counter.labels(**{**self.labels, label_key: label}).inc(count) + + def _log_histogram(self, histogram, data: Union[List[int], List[float]]) -> None: + # Convenience function for logging list to histogram. 
+ for datum in data: + histogram.labels(**self.labels).observe(datum) + + def log_stats(self, stats: Stats) -> None: + self._log_gauge(self.metrics.max_total_num_tokens, stats.max_total_num_tokens) + self._log_gauge(self.metrics.max_prefill_tokens, stats.max_prefill_tokens) + self._log_gauge(self.metrics.max_running_requests, stats.max_running_requests) + self._log_gauge(self.metrics.context_len, stats.context_len) + self._log_histogram( + self.metrics.num_prompt_tokens_requests, stats.num_prompt_tokens_requests + ) + self._log_histogram( + self.metrics.num_generation_tokens_requests, + stats.num_generation_tokens_requests, + ) + # self._log_gauge(self.metrics.gpu_cache_usage_sys, stats.gpu_cache_usage_sys) + self._log_gauge(self.metrics.num_running_sys, stats.num_running_req) + self._log_gauge(self.metrics.num_waiting_sys, stats.num_waiting_req) + self._log_gauge(self.metrics.gen_throughput, stats.gen_throughput) + self._log_gauge(self.metrics.token_usage, stats.token_usage) + self._log_histogram( + self.metrics.histogram_time_e2e_requests, stats.time_e2e_requests + ) + self._log_histogram( + self.metrics.histogram_time_waiting_requests, stats.time_waiting_requests + ) + self._log_histogram( + self.metrics.histogram_time_decode_requests, stats.time_decode_requests + ) + self._log_gauge(self.metrics.new_seq, stats.new_seq) + self._log_gauge(self.metrics.new_token, stats.new_token) + self._log_gauge(self.metrics.cached_token, stats.cached_token) + self._log_gauge(self.metrics.cache_hit_rate, stats.cache_hit_rate) + self._log_gauge(self.metrics.queue_req, stats.queue_req) + + +def build_1_2_5_buckets(max_value: int) -> List[int]: + """ + Builds a list of buckets with increasing powers of 10 multiplied by + mantissa values (1, 2, 5) until the value exceeds the specified maximum. 
+ + Example: + >>> build_1_2_5_buckets(100) + [1, 2, 5, 10, 20, 50, 100] + """ + mantissa_lst = [1, 2, 5] + exponent = 0 + buckets: List[int] = [] + while True: + for m in mantissa_lst: + value = m * 10**exponent + if value <= max_value: + buckets.append(value) + else: + return buckets + exponent += 1 diff --git a/python/sglang/srt/metrics/metrics_types.py b/python/sglang/srt/metrics/metrics_types.py new file mode 100644 index 00000000000..adddd6c2f12 --- /dev/null +++ b/python/sglang/srt/metrics/metrics_types.py @@ -0,0 +1,54 @@ +""" +Copyright 2023-2024 SGLang Team +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +"""Metrics Types""" + +from dataclasses import dataclass, field +from typing import List + + +@dataclass +class Stats: + # config + max_total_num_tokens: int = 0 + max_prefill_tokens: int = 0 + max_running_requests: int = 0 + context_len: int = 0 + # request stats + num_prompt_tokens_requests: List[int] = field(default_factory=list) + num_generation_tokens_requests: List[int] = field(default_factory=list) + finished_reason_requests: List[str] = field(default_factory=list) + # decode stats + num_running_req: int = 0 + num_waiting_req: int = 0 + gen_throughput: float = 0.0 + num_token: int = 0 + token_usage: float = 0.0 + waiting_queue: int = 0 + time_e2e_requests: List[float] = field(default_factory=list) + time_waiting_requests: List[float] = field(default_factory=list) + time_decode_requests: List[float] = field(default_factory=list) + # system stats + token_usage: float = 0.0 + is_mixed_chunk: bool = False + new_seq: int = 0 + new_token: int = 0 + cached_token: int = 0 + cache_hit_rate: float = 0.0 + running_req: int = 0 + queue_req: int = 0 + + +# TODO Iteration stats diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 3d3a0d4bc50..de57ca2d2b7 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -24,6 +24,8 @@ import logging import multiprocessing as mp import os +import re +import tempfile import threading import time from http import HTTPStatus @@ -39,6 +41,7 @@ from fastapi import FastAPI, File, Form, Request, UploadFile from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse +from starlette.routing import Mount from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.srt.constrained import disable_cache @@ -90,6 +93,10 @@ logger = logging.getLogger(__name__) +# Temporary directory for prometheus multiprocess mode +# Cleaned up automatically when this object is garbage collected +prometheus_multiproc_dir: 
tempfile.TemporaryDirectory + asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) @@ -296,6 +303,19 @@ async def retrieve_file_content(file_id: str): return await v1_retrieve_file_content(file_id) +def add_prometheus_middleware(app: FastAPI): + # Adopted from https://github.com/vllm-project/vllm/blob/v0.6.1/vllm/entrypoints/openai/api_server.py#L216 + from prometheus_client import CollectorRegistry, make_asgi_app, multiprocess + + registry = CollectorRegistry() + multiprocess.MultiProcessCollector(registry) + metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) + + # Workaround for 307 Redirect for /metrics + metrics_route.path_regex = re.compile("^/metrics(?P.*)$") + app.routes.append(metrics_route) + + def launch_server( server_args: ServerArgs, pipe_finish_writer: Optional[mp.connection.Connection] = None, @@ -397,6 +417,8 @@ def launch_server( if server_args.api_key: add_api_key_middleware(app, server_args.api_key) + add_prometheus_middleware(app) + # Send a warmup request t = threading.Thread( target=_wait_and_warmup, args=(server_args, pipe_finish_writer, os.getpid()) @@ -425,6 +447,21 @@ def _set_envs_and_config(server_args: ServerArgs): os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + # Set prometheus multiprocess directory + # sglang uses prometheus multiprocess mode + # we need to set this before importing prometheus_client + # https://prometheus.github.io/client_python/multiprocess/ + global prometheus_multiproc_dir + if "PROMETHEUS_MULTIPROC_DIR" in os.environ: + logger.debug(f"User set PROMETHEUS_MULTIPROC_DIR detected.") + prometheus_multiproc_dir = tempfile.TemporaryDirectory( + dir=os.environ["PROMETHEUS_MULTIPROC_DIR"] + ) + else: + prometheus_multiproc_dir = tempfile.TemporaryDirectory() + os.environ["PROMETHEUS_MULTIPROC_DIR"] = prometheus_multiproc_dir.name + logger.debug(f"PROMETHEUS_MULTIPROC_DIR: {os.environ['PROMETHEUS_MULTIPROC_DIR']}") + # Set ulimit set_ulimit()