[Dashboard] Add instance variables to many default dashboard graphs (ray-project#49174)

Signed-off-by: Alan Guo <aguo@anyscale.com>
Signed-off-by: ujjawal-khare <ujjawal.khare@dream11.com>
alanwguo authored and ujjawal-khare committed Dec 17, 2024
1 parent e75db28 commit aa3986a
Showing 2 changed files with 40 additions and 44 deletions.
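The recurring change in the first file (the default dashboard panel definitions) is to scope each PromQL query to the dashboard's "Instance" template variable by adding an instance=~"$Instance" (or NodeAddress=~"$Instance") label matcher in front of {global_filters}, and to floor a few task counts with clamp_min(..., 0) so they never dip below zero. A minimal, self-contained sketch of that pattern follows; the Panel/Target dataclasses, the panel id/title, and the global_filters value used at the end are placeholders inferred from the diff, not the actual Ray dashboard module.

from dataclasses import dataclass, field
from typing import List


# Hypothetical stand-ins for the Panel/Target dataclasses whose fields appear in
# the diff; the real definitions live elsewhere in the Ray dashboard code.
@dataclass
class Target:
    expr: str    # PromQL template; doubled braces survive str.format as literal { and }
    legend: str


@dataclass
class Panel:
    id: int
    title: str
    description: str
    unit: str
    targets: List[Target] = field(default_factory=list)


# Illustrative panel: each query gains an instance=~"$Instance" matcher so the
# graph honors the Instance dropdown, and the usage series is grouped by instance.
example_panel = Panel(
    id=999,  # placeholder id
    title="Example CPU Panel",
    description="Logical CPU usage, scoped by the Instance variable.",
    unit="cores",
    targets=[
        Target(
            expr='sum(ray_resources{{Name="CPU",State="USED",instance=~"$Instance",{global_filters}}}) by (instance)',
            legend="CPU Usage: {{instance}}",
        ),
        Target(
            expr='sum(ray_resources{{Name="CPU",instance=~"$Instance",{global_filters}}})',
            legend="MAX",
        ),
    ],
)

# At generation time the {global_filters} placeholder is filled in via str.format;
# the filter value here is illustrative only.
print(example_panel.targets[0].expr.format(global_filters='SessionName=~"$SessionName"'))
# -> sum(ray_resources{Name="CPU",State="USED",instance=~"$Instance",SessionName=~"$SessionName"}) by (instance)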
@@ -38,11 +38,11 @@ def max_plus_pending(max_resource, pending_resource):
unit="tasks",
targets=[
Target(
expr='sum(max_over_time(ray_tasks{{IsRetry="0",State=~"FINISHED|FAILED",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",{global_filters}}}) by (State), 0)',
expr='sum(max_over_time(ray_tasks{{IsRetry="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)',
legend="{{State}}",
),
Target(
expr='sum(max_over_time(ray_tasks{{IsRetry!="0",State=~"FINISHED|FAILED",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",{global_filters}}}) by (State), 0)',
expr='sum(max_over_time(ray_tasks{{IsRetry!="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)',
legend="{{State}} (retry)",
),
],
@@ -56,11 +56,11 @@ def max_plus_pending(max_resource, pending_resource):
unit="tasks",
targets=[
Target(
expr='sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",{global_filters}}}) by (Name)',
expr='clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
legend="{{Name}}",
),
Target(
expr='sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",{global_filters}}}) by (Name)',
expr='clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
legend="{{Name}} (retry)",
),
],
@@ -74,11 +74,11 @@ def max_plus_pending(max_resource, pending_resource):
unit="tasks",
targets=[
Target(
expr='sum(ray_tasks{{IsRetry="0",State=~"RUNNING*",{global_filters}}}) by (Name)',
expr='clamp_min(sum(ray_tasks{{IsRetry="0",State=~"RUNNING*",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
legend="{{Name}}",
),
Target(
expr='sum(ray_tasks{{IsRetry!="0",State=~"RUNNING*",{global_filters}}}) by (Name)',
expr='clamp_min(sum(ray_tasks{{IsRetry!="0",State=~"RUNNING*",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
legend="{{Name}} (retry)",
),
],
@@ -88,7 +88,7 @@ def max_plus_pending(max_resource, pending_resource):
Panel(
id=33,
title="Scheduler Actor State",
description="Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.",
description='Note: not impacted by "Instance" variable.\n\nCurrent number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.',
unit="actors",
targets=[
Target(
@@ -99,24 +99,24 @@ def max_plus_pending(max_resource, pending_resource):
),
Panel(
id=42,
title="Alive Actor State",
title="Live Actor State",
description="Current number of alive actors in a particular state.\n\nState: IDLE, RUNNING_TASK, RUNNING_IN_RAY_GET, RUNNING_IN_RAY_WAIT",
unit="actors",
targets=[
Target(
expr='sum(ray_actors{{Source="executor",{global_filters}}}) by (State)',
expr='sum(ray_actors{{Source="executor",NodeAddress=~"$Instance",{global_filters}}}) by (State)',
legend="{{State}}",
)
],
),
Panel(
id=36,
title="Requested Live Actors by Name",
description="Current number of (live) actors with a particular name.",
title="Live Actors by Name",
description="Current number of alive actors with a particular name.",
unit="actors",
targets=[
Target(
expr='sum(ray_actors{{State!="DEAD",Source="gcs",{global_filters}}}) by (Name)',
expr='sum(ray_actors{{State!="DEAD",Source="executor",NodeAddress=~"$Instance",{global_filters}}}) by (Name)',
legend="{{Name}}",
)
],
@@ -128,11 +128,11 @@ def max_plus_pending(max_resource, pending_resource):
unit="cores",
targets=[
Target(
expr='sum(ray_resources{{Name="CPU",State="USED",{global_filters}}}) by (instance)',
expr='sum(ray_resources{{Name="CPU",State="USED",instance=~"$Instance",{global_filters}}}) by (instance)',
legend="CPU Usage: {{instance}}",
),
Target(
expr='sum(ray_resources{{Name="CPU",{global_filters}}})',
expr='sum(ray_resources{{Name="CPU",instance=~"$Instance",{global_filters}}})',
legend="MAX",
),
# If max + pending > max, we display this value.
@@ -150,11 +150,11 @@ def max_plus_pending(max_resource, pending_resource):
unit="bytes",
targets=[
Target(
expr="sum(ray_object_store_memory{{{global_filters}}}) by (Location)",
expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) by (Location)',
legend="{{Location}}",
),
Target(
expr='sum(ray_resources{{Name="object_store_memory",{global_filters}}})',
expr='sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}})',
legend="MAX",
),
],
@@ -166,11 +166,11 @@ def max_plus_pending(max_resource, pending_resource):
unit="GPUs",
targets=[
Target(
expr='ray_resources{{Name="GPU",State="USED",{global_filters}}}',
expr='ray_resources{{Name="GPU",State="USED",instance=~"$Instance",{global_filters}}}',
legend="GPU Usage: {{instance}}",
),
Target(
expr='sum(ray_resources{{Name="GPU",{global_filters}}})',
expr='sum(ray_resources{{Name="GPU",instance=~"$Instance",{global_filters}}})',
legend="MAX",
),
# If max + pending > max, we display this value.
@@ -184,7 +184,7 @@ def max_plus_pending(max_resource, pending_resource):
Panel(
id=40,
title="Scheduler Placement Groups",
description="Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.",
description='Note: not impacted by "Instance" variable.\n\nCurrent number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.',
unit="placement groups",
targets=[
Target(
@@ -208,7 +208,7 @@ def max_plus_pending(max_resource, pending_resource):
legend="CPU Usage: {{instance}} (head)",
),
Target(
expr="sum(ray_node_cpu_count{{{global_filters}}})",
expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})',
legend="MAX",
),
],
@@ -228,7 +228,7 @@ def max_plus_pending(max_resource, pending_resource):
legend="GPU Usage: {{instance}} (head), gpu.{{GpuIndex}}, {{GpuDeviceName}}",
),
Target(
expr="sum(ray_node_gpus_available{{{global_filters}}})",
expr='sum(ray_node_gpus_available{{instance=~"$Instance",{global_filters}}})',
legend="MAX",
),
],
@@ -248,7 +248,7 @@ def max_plus_pending(max_resource, pending_resource):
legend="Disk Used: {{instance}} (head)",
),
Target(
expr="sum(ray_node_disk_free{{{global_filters}}}) + sum(ray_node_disk_usage{{{global_filters}}})",
expr='sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})',
legend="MAX",
),
],
@@ -292,7 +292,7 @@ def max_plus_pending(max_resource, pending_resource):
legend="Memory Used: {{instance}} (head)",
),
Target(
expr="sum(ray_node_mem_total{{{global_filters}}})",
expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})',
legend="MAX",
),
],
@@ -334,15 +334,15 @@ def max_plus_pending(max_resource, pending_resource):
unit="bytes",
targets=[
Target(
expr="(sum(ray_component_rss_mb{{{global_filters}}} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{{{global_filters}}}) by (Component))",
expr='(sum(ray_component_rss_mb{{instance=~"$Instance",{global_filters}}} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{{instance=~"$Instance",{global_filters}}}) by (Component))',
legend="{{Component}}",
),
Target(
expr="sum(ray_node_mem_shared_bytes{{{global_filters}}})",
expr='sum(ray_node_mem_shared_bytes{{instance=~"$Instance",{global_filters}}})',
legend="shared_memory",
),
Target(
expr="sum(ray_node_mem_total{{{global_filters}}})",
expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})',
legend="MAX",
),
],
@@ -355,11 +355,11 @@ def max_plus_pending(max_resource, pending_resource):
targets=[
Target(
# ray_component_cpu_percentage returns a percentage that can be > 100. It means that it uses more than 1 CPU.
expr="sum(ray_component_cpu_percentage{{{global_filters}}}) by (Component) / 100",
expr='sum(ray_component_cpu_percentage{{instance=~"$Instance",{global_filters}}}) by (Component) / 100',
legend="{{Component}}",
),
Target(
expr="sum(ray_node_cpu_count{{{global_filters}}})",
expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})',
legend="MAX",
),
],
@@ -375,7 +375,7 @@ def max_plus_pending(max_resource, pending_resource):
legend="Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
),
Target(
expr="(sum(ray_node_gram_available{{{global_filters}}}) + sum(ray_node_gram_used{{{global_filters}}})) * 1024 * 1024",
expr='(sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 1024 * 1024',
legend="MAX",
),
],
@@ -399,7 +399,7 @@ def max_plus_pending(max_resource, pending_resource):
Panel(
id=24,
title="Node Count",
description="A total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.",
description='Note: not impacted by "Instance" variable.\n\nA total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there\'s no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.',
unit="nodes",
targets=[
Target(
@@ -424,32 +424,32 @@ def max_plus_pending(max_resource, pending_resource):
targets=[
# CPU
Target(
expr="avg(ray_node_cpu_utilization{{{global_filters}}})",
expr='avg(ray_node_cpu_utilization{{instance=~"$Instance",{global_filters}}})',
legend="CPU (physical)",
),
# GPU
Target(
expr="sum(ray_node_gpus_utilization{{{global_filters}}}) / on() (sum(autoscaler_cluster_resources{{resource='GPU',{global_filters}}}) or vector(0))",
expr='sum(ray_node_gpus_utilization{{instance=~"$Instance",{global_filters}}}) / on() (sum(autoscaler_cluster_resources{{resource="GPU",instance=~"$Instance",{global_filters}}}) or vector(0))',
legend="GPU (physical)",
),
# Memory
Target(
expr="sum(ray_node_mem_used{{{global_filters}}}) / on() (sum(ray_node_mem_total{{{global_filters}}})) * 100",
expr='sum(ray_node_mem_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})) * 100',
legend="Memory (RAM)",
),
# GRAM
Target(
expr="sum(ray_node_gram_used{{{global_filters}}}) / on() (sum(ray_node_gram_available{{{global_filters}}}) + sum(ray_node_gram_used{{{global_filters}}})) * 100",
expr='sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 100',
legend="GRAM",
),
# Object Store
Target(
expr='sum(ray_object_store_memory{{{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",{global_filters}}}) * 100',
expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}}) * 100',
legend="Object Store Memory",
),
# Disk
Target(
expr="sum(ray_node_disk_usage{{{global_filters}}}) / on() (sum(ray_node_disk_free{{{global_filters}}}) + sum(ray_node_disk_usage{{{global_filters}}})) * 100",
expr='sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})) * 100',
legend="Disk",
),
],
@@ -73,15 +73,11 @@
"allValue": ".+",
"current": {
"selected": true,
"text": [
"All"
],
"value": [
"$__all"
]
"text": ["All"],
"value": ["$__all"]
},
"datasource": "${datasource}",
"definition": "label_values(ray_node_network_receive_speed{{SessionName=\"$SessionName\",{global_filters}}}, instance)",
"definition": "label_values(ray_node_network_receive_speed{{SessionName=~\"$SessionName\",{global_filters}}}, instance)",
"description": null,
"error": null,
"hide": 0,
@@ -91,7 +87,7 @@
"name": "Instance",
"options": [],
"query": {
"query": "label_values(ray_node_network_receive_speed{{SessionName=\"$SessionName\",{global_filters}}}, instance)",
"query": "label_values(ray_node_network_receive_speed{{SessionName=~\"$SessionName\",{global_filters}}}, instance)",
"refId": "Prometheus-Instance-Variable-Query"
},
"refresh": 2,
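The second file is the dashboard's Grafana template JSON. Besides compacting the current text/value arrays for the Instance variable, its label_values query switches the SessionName matcher from an exact match (=) to a regex match (=~), presumably so the variable still resolves when $SessionName expands to a regex value (for example an "All" selection or a multi-select alternation) rather than a single literal session name. A rough Python analogy of the two matcher semantics; the label and selection values are made up:

import re

label_value = "session_a"  # the SessionName label on a metric (illustrative)

for selection in ["session_a", ".+", "session_a|session_b"]:
    exact = label_value == selection                          # like PromQL =
    regex = re.fullmatch(selection, label_value) is not None  # like PromQL =~
    print(f"{selection!r}: exact={exact}, regex={regex}")

# Only the regex matcher keeps matching once the variable expands to ".+" or an
# alternation, which is why an exact matcher would return no instances in those cases.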
