diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7c16690b7c51b..dc231fa07ed2d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -367,6 +367,11 @@ datadog_checks_base/datadog_checks/base/checks/windows/ @DataDog/wi /temporal_cloud/manifest.json @DataDog/saas-integrations @DataDog/documentation /temporal_cloud/metadata.csv @DataDog/saas-integrations @DataDog/documentation +/temporal_cloud/ @DataDog/saas-integrations +/temporal_cloud/*.md @DataDog/saas-integrations @DataDog/documentation +/temporal_cloud/manifest.json @DataDog/saas-integrations @DataDog/documentation +/temporal_cloud/metadata.csv @DataDog/saas-integrations @DataDog/documentation + /trend_micro_email_security/ @DataDog/saas-integrations /trend_micro_email_security/*.md @DataDog/saas-integrations @DataDog/documentation /trend_micro_email_security/manifest.json @DataDog/saas-integrations @DataDog/documentation diff --git a/temporal_cloud/CHANGELOG.md b/temporal_cloud/CHANGELOG.md index a64023ed3d344..b457162511a56 100644 --- a/temporal_cloud/CHANGELOG.md +++ b/temporal_cloud/CHANGELOG.md @@ -1,4 +1,4 @@ -# CHANGELOG - Temporal_Cloud +# CHANGELOG - Temporal Cloud ## 1.0.0 / 2024-11-26 diff --git a/temporal_cloud/README.md b/temporal_cloud/README.md index c780d1f397562..13dc7db2355a0 100644 --- a/temporal_cloud/README.md +++ b/temporal_cloud/README.md @@ -1,41 +1,56 @@ ## Overview -This check monitors [Temporal_Cloud][1]. +[Temporal Cloud][1] is a scalable platform for orchestrating complex workflows, with built-in reliability, resilience, and timing controls. Temporal Cloud enables developers to focus on application logic without worrying about fault tolerance and consistency. + + +This integration gathers Temporal Cloud metrics into Datadog, offering insights into system health, workflow efficiency, task execution, and performance bottlenecks. ## Setup -### Installation +### Generate a Metrics endpoint URL in Temporal Cloud + +1. 
To generate a CA certificate and an end-entity certificate, see [certificate management][2]. + - **Note**: An expired root CA certificate invalidates all downstream certificates. To avoid disruptions to your systems, use certificates with long validity periods. +2. Log in to [Temporal Cloud][3] with an account owner or global admin role. +3. Go to **Settings**, and select the **Observability** tab. +4. Under the **Certificates** section, add your root CA certificate (`.pem` file content) and save it. + - **Note**: If an observability endpoint is already set up, you can append your root CA certificate. +5. Click **Save** to generate the endpoint URL under the **Endpoint** section. The URL should look like: `https://<account_id>.tmprl.cloud/prometheus`. -The Temporal_Cloud check is included in the [Datadog Agent][2] package. -No additional installation is needed on your server. -### Configuration +### Connect your Temporal Cloud account to Datadog -!!! Add list of steps to set up this integration !!! +1. Add your Account ID, End-entity Certificate file content, and End-entity Certificate key file content. + |Parameters|Description| + |--------------------|--------------------| + |Account ID|Temporal Cloud account ID to be used as part of the metrics endpoint URL: `https://<account_id>.tmprl.cloud/prometheus`.| + |End-entity certificate file content|Contents of the end-entity certificate for secure access and communication with the Metrics endpoint.| + |End-entity certificate key file content|Contents of the end-entity certificate key for secure access and communication with the Metrics endpoint.| -### Validation +2. Click the **Save** button to save your settings. -!!! Add steps to validate integration is functioning as expected !!! ## Data Collected ### Metrics -Temporal_Cloud does not include any metrics. +See [metadata.csv][4] for a list of metrics provided by this integration. + ### Service Checks -Temporal_Cloud does not include any service checks. 
+The Temporal Cloud integration does not include any service checks. ### Events -Temporal_Cloud does not include any events. - -## Troubleshooting +The Temporal Cloud integration does not include any events. -Need help? Contact [Datadog support][3]. +## Support -[1]: **LINK_TO_INTEGRATION_SITE** -[2]: https://app.datadoghq.com/account/settings/agent/latest -[3]: https://docs.datadoghq.com/help/ +Need help? Contact [Datadog support][5]. +[1]: https://temporal.io/cloud/ +[2]: https://docs.temporal.io/cloud/certificates#use-certstrap +[3]: https://cloud.temporal.io/ +[4]: https://github.com/DataDog/integrations-core/blob/master/temporal_cloud/metadata.csv +[5]: https://docs.datadoghq.com/help/ diff --git a/temporal_cloud/assets/dashboards/temporal_cloud_overview.json b/temporal_cloud/assets/dashboards/temporal_cloud_overview.json new file mode 100644 index 0000000000000..1c84d8b87d085 --- /dev/null +++ b/temporal_cloud/assets/dashboards/temporal_cloud_overview.json @@ -0,0 +1,3771 @@ +{ + "title": "Temporal Cloud - Overview", + "description": "This dashboard provides insights into system health, performance and workflow efficiency for your Temporal Cloud instance.", + "widgets": [ + { + "id": 8740298734186812, + "definition": { + "type": "image", + "url": "https://images.ctfassets.net/0uuz8ydxyd9p/6lHpuU1sKtTBbWj6VS1Llh/7d48148041d51d513c5820cb1a0e7d5d/Temporal_LogoLockup_Horizontal_dark_1_2x.png", + "url_dark_theme": "https://images.ctfassets.net/0uuz8ydxyd9p/2ctnUPEhKA75tYnrl2Kzvj/90563965bc4ea2af9442b6eb4ba43180/Temporal_LogoLockup_Horizontal_light_1_2x.png", + "sizing": "contain", + "margin": "sm", + "has_background": true, + "has_border": false, + "vertical_align": "center", + "horizontal_align": "center" + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 2 + } + }, + { + "id": 5349449283720096, + "definition": { + "title": "Monitors Summary", + "background_color": "vivid_blue", + "show_title": true, + "type": "group", + "layout_type": "ordered", + 
"widgets": [ + { + "id": 8407084925998778, + "definition": { + "title": "Monitors Summary", + "type": "manage_status", + "display_format": "countsAndList", + "color_preference": "text", + "hide_zero_counts": true, + "show_status": true, + "last_triggered_format": "relative", + "query": "tag:(integration:temporal-cloud)", + "sort": "status,asc", + "count": 50, + "start": 0, + "summary_type": "monitors", + "show_priority": false, + "show_last_triggered": false + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 4 + } + } + ] + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 5 + } + }, + { + "id": 323399805713430, + "definition": { + "type": "note", + "content": "**[Temporal Cloud](https://temporal.io/cloud)** streamlines scalable application development by orchestrating workflows, retries, and state management.\n\nThis dashboard provides insights into system health, performance and workflow efficiency for your Temporal Cloud instance.\n\nFor more information, see the [Temporal Cloud Integration Documentation](https://docs.datadoghq.com/integrations/temporal_cloud/).\n\n**Tip**:\n- Clone this dashboard to rearrange, modify and add widgets and visualizations.", + "background_color": "gray", + "font_size": "14", + "text_align": "left", + "vertical_align": "top", + "show_tick": true, + "tick_pos": "50%", + "tick_edge": "top", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 2, + "width": 6, + "height": 3 + } + }, + { + "id": 6745713151482410, + "definition": { + "title": "Service Latency Metrics", + "background_color": "vivid_blue", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 6596360524579660, + "definition": { + "title": "Avg StartWorkflowExecution Service Latency (P50)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": 
"avg:temporal_cloud.cloud_metrics.v0_service_latency_p50{operation:startworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 8775051559517178, + "definition": { + "title": "Avg SignalWorkflowExecution Service Latency (P50)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p50{operation:signalworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green", + "custom_bg_color": "#f82a2a" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow", + "custom_bg_color": "#3cec7f" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 4, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 2585371837746560, + "definition": { + "title": "Avg SignalWithStartWorkflowExecution Service Latency 
(P50)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p50{operation:signalwithstartworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green", + "custom_bg_color": "#9f1e1e" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow", + "custom_bg_color": "#73e28f" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 8, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 8602979691001942, + "definition": { + "title": "Service Latency (P50) over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p50{$Namespace} by {operation}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 3, + "width": 12, + "height": 4 + } + }, + { + "id": 2720217781741226, + "definition": { + "title": "Avg StartWorkflowExecution Service Latency (P90)", + "title_size": "16", + "title_align": 
"left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p90{operation:startworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 7, + "width": 4, + "height": 3 + } + }, + { + "id": 3521776471353254, + "definition": { + "title": "Avg SignalWorkflowExecution Service Latency (P90)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p90{operation:signalworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green", + "custom_bg_color": "#f82a2a" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow", + "custom_bg_color": "#3cec7f" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 4, + 
"y": 7, + "width": 4, + "height": 3 + } + }, + { + "id": 6556065257189900, + "definition": { + "title": "Avg SignalWithStartWorkflowExecution Service Latency (P90)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p90{operation:signalwithstartworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green", + "custom_bg_color": "#9f1e1e" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow", + "custom_bg_color": "#73e28f" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 8, + "y": 7, + "width": 4, + "height": 3 + } + }, + { + "id": 7831482080727952, + "definition": { + "title": "Service Latency (P90) over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p90{$Namespace} by {operation}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 10, + "width": 12, + "height": 4 
+ } + }, + { + "id": 3811540902858216, + "definition": { + "title": "Avg StartWorkflowExecution Service Latency (P95)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p95{operation:startworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 14, + "width": 4, + "height": 3 + } + }, + { + "id": 6386629657872384, + "definition": { + "title": "Avg SignalWorkflowExecution Service Latency (P95)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p95{operation:signalworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green", + "custom_bg_color": "#f82a2a" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow", + "custom_bg_color": "#3cec7f" + }, + { + "comparator": ">", + "value": 0.2, + "palette": 
"black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 4, + "y": 14, + "width": 4, + "height": 3 + } + }, + { + "id": 5711438364606902, + "definition": { + "title": "Avg SignalWithStartWorkflowExecution Service Latency (P95)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p95{operation:signalwithstartworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green", + "custom_bg_color": "#9f1e1e" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow", + "custom_bg_color": "#73e28f" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 8, + "y": 14, + "width": 4, + "height": 3 + } + }, + { + "id": 5217501760119796, + "definition": { + "title": "Service Latency (P95) over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p95{$Namespace} by {operation}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": 
"values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 17, + "width": 12, + "height": 4 + } + }, + { + "id": 7206975012378310, + "definition": { + "title": "Avg StartWorkflowExecution Service Latency (P99)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p99{operation:startworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 21, + "width": 4, + "height": 3 + } + }, + { + "id": 9005631171257400, + "definition": { + "title": "Avg SignalWorkflowExecution Service Latency (P99)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p99{operation:signalworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green", + "custom_bg_color": "#f82a2a" + }, + { 
+ "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow", + "custom_bg_color": "#3cec7f" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 4, + "y": 21, + "width": 4, + "height": 3 + } + }, + { + "id": 8404232948383498, + "definition": { + "title": "Avg SignalWithStartWorkflowExecution Service Latency (P99)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:temporal_cloud.cloud_metrics.v0_service_latency_p99{operation:signalwithstartworkflowexecution,$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green", + "custom_bg_color": "#9f1e1e" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow", + "custom_bg_color": "#73e28f" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 8, + "y": 21, + "width": 4, + "height": 3 + } + }, + { + "id": 7206412369508920, + "definition": { + "title": "Service Latency (P99) over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": 
"avg:temporal_cloud.cloud_metrics.v0_service_latency_p99{$Namespace} by {operation}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 24, + "width": 12, + "height": 4 + } + } + ] + }, + "layout": { + "x": 0, + "y": 5, + "width": 12, + "height": 1 + } + }, + { + "id": 8893308504271682, + "definition": { + "title": "Frontend Service Overview", + "background_color": "vivid_blue", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 298345148190314, + "definition": { + "title": "Avg gRPC Error Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_frontend_service_error_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 3, + "height": 3 + } + }, + { + "id": 4801477252304832, + "definition": { + "title": "gRPC Error Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_frontend_service_error_increase1m{$Namespace} by {operation}.as_rate()" + } + ], + "response_format": 
"timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 3, + "y": 0, + "width": 9, + "height": 3 + } + }, + { + "id": 7948259305138766, + "definition": { + "title": "Avg gRPC Request Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_frontend_service_request_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_green" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 3, + "height": 3 + } + }, + { + "id": 2182787751685328, + "definition": { + "title": "gRPC Request Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_frontend_service_request_increase1m{$Namespace} by {operation}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 3, + "y": 3, + "width": 9, + "height": 3 + } + }, + { + "id": 2367630247389044, + "definition": { + "title": "Avg Rate-Limited Requests Rate", + "title_size": "16", + "title_align": "left", + "type": 
"query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_resource_exhausted_error_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red", + "custom_bg_color": "#699263", + "custom_fg_color": "#729e6b" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 6, + "width": 3, + "height": 3 + } + }, + { + "id": 5987314578723806, + "definition": { + "title": "Rate-Limited Request Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_resource_exhausted_error_increase1m{$Namespace} by {resource_exhausted_cause}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 3, + "y": 6, + "width": 9, + "height": 3 + } + }, + { + "id": 3903383398206538, + "definition": { + "title": "Avg State Transition Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_state_transition_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": 
"query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_green", + "custom_fg_color": "#b0d058" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 9, + "width": 3, + "height": 3 + } + }, + { + "id": 3169127014752062, + "definition": { + "title": "State Transition Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_state_transition_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 3, + "y": 9, + "width": 9, + "height": 3 + } + }, + { + "id": 7939379235471620, + "definition": { + "title": "Avg Actions Per Second (APS)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_total_action_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "custom_bg", + "custom_fg_color": "#f00a0a", + "custom_bg_color": "#65a8e6" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 12, 
+ "width": 3, + "height": 3 + } + }, + { + "id": 786592074502738, + "definition": { + "title": "Actions Per Second over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_total_action_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 3, + "y": 12, + "width": 9, + "height": 3 + } + }, + { + "id": 5981689788717980, + "definition": { + "title": "gRPC Error Percentage", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_frontend_service_error_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + }, + { + "name": "query2", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_frontend_service_request_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + } + }, + "formula": "(query1 / query2) * 100" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_yellow", + "custom_fg_color": "#df7777" + }, + { + "comparator": ">", + "value": 5, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 
0, + "y": 15, + "width": 3, + "height": 4 + } + }, + { + "id": 4183326508294474, + "definition": { + "title": "Actions Per Second by Namespace Mode", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_total_action_increase1m{$Namespace} by {namespace_mode}.as_rate()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "style": { + "palette": "datadog16" + }, + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "hide_total": false, + "legend": { + "type": "table" + } + }, + "layout": { + "x": 3, + "y": 15, + "width": 9, + "height": 4 + } + }, + { + "id": 4916903371261782, + "definition": { + "title": "Top Operations by gRPC Requests Rate", + "title_size": "16", + "title_align": "left", + "type": "toplist", + "requests": [ + { + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_frontend_service_request_increase1m{$Namespace} by {operation}.as_rate()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 10, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "style": { + "display": { + "type": "stacked", + "legend": "automatic" + } + } + }, + "layout": { + "x": 0, + "y": 19, + "width": 4, + "height": 4 + } + }, + { + "id": 2325501122225016, + "definition": { + "title": "Top Cause for Rate-limited Requests", + "title_size": "16", + "title_align": "left", + "type": "toplist", + "requests": [ + { + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_resource_exhausted_error_increase1m{$Namespace} by {resource_exhausted_cause}.as_rate()", + 
"aggregator": "avg" + } + ], + "response_format": "scalar", + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "style": { + "display": { + "type": "stacked", + "legend": "automatic" + }, + "palette": "datadog16" + } + }, + "layout": { + "x": 4, + "y": 19, + "width": 4, + "height": 4 + } + }, + { + "id": 8249413955524886, + "definition": { + "title": "Top Operations by gRPC Error Rate", + "title_size": "16", + "title_align": "left", + "type": "toplist", + "requests": [ + { + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_frontend_service_error_increase1m{$Namespace} by {operation}.as_rate()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 10, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "style": { + "display": { + "type": "stacked", + "legend": "automatic" + } + } + }, + "layout": { + "x": 8, + "y": 19, + "width": 4, + "height": 4 + } + } + ] + }, + "layout": { + "x": 0, + "y": 6, + "width": 12, + "height": 1 + } + }, + { + "id": 1266663704400704, + "definition": { + "title": "Task Polling Overview", + "background_color": "vivid_blue", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 2236256623748982, + "definition": { + "title": "Avg Task Poll Success Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_success_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + 
"palette": "black_on_light_green" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 2495307867177718, + "definition": { + "title": "Task Poll Success Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_success_increase1m{$Namespace} by {task_type}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 0, + "width": 8, + "height": 3 + } + }, + { + "id": 474393580792922, + "definition": { + "title": "Task Sync Match Percentage", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_success_sync_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + }, + { + "name": "query2", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_success_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + } + }, + "formula": "(query1 / query2) * 100" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_green" + } + ] + } + ], + "autoscale": false, + "precision": 2, + 
"timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 3, + "height": 3 + } + }, + { + "id": 5326722891696668, + "definition": { + "title": "Avg Task Poll Sync Success Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_success_sync_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_green" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 3, + "y": 3, + "width": 3, + "height": 3 + } + }, + { + "id": 2731591411925024, + "definition": { + "title": "Task Types by Poll Timeout", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "formulas": [ + { + "formula": "query1", + "limit": { + "order": "desc" + } + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_timeout_increase1m{$Namespace} by {task_type}.as_rate()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "style": { + "palette": "datadog16" + } + } + ], + "type": "sunburst", + "legend": { + "type": "table" + } + }, + "layout": { + "x": 6, + "y": 3, + "width": 6, + "height": 6 + } + }, + { + "id": 8353431131543564, + "definition": { + "title": "Avg Task Poll Timeout Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query":
"sum:temporal_cloud.cloud_metrics.v0_poll_timeout_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 6, + "width": 3, + "height": 3 + } + }, + { + "id": 7537330670601152, + "definition": { + "title": "Poll Timeout Percentage", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + } + }, + "formula": "(query1 / (query1 + query2 + query3)) * 100" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_timeout_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + }, + { + "name": "query2", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_success_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + }, + { + "name": "query3", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_success_sync_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 3, + "y": 6, + "width": 3, + "height": 3 + } + }, + { + "id": 3860476950453548, + "definition": { + "title": "Task Types by Poll Success", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "formulas": [ + { + "formula": "query1", + "limit": { + "order": "desc" + } + } + ], + 
"queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_success_increase1m{$Namespace} by {task_type}.as_rate()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "style": { + "palette": "datadog16" + } + } + ], + "type": "sunburst", + "legend": { + "type": "table" + } + }, + "layout": { + "x": 0, + "y": 9, + "width": 6, + "height": 4 + } + }, + { + "id": 7112687077524268, + "definition": { + "title": "Task Types by Poll Sync Success", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "formulas": [ + { + "formula": "query1", + "limit": { + "order": "desc" + } + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_poll_success_sync_increase1m{$Namespace} by {task_type}.as_rate()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "style": { + "palette": "datadog16" + } + } + ], + "type": "sunburst", + "legend": { + "type": "table" + } + }, + "layout": { + "x": 6, + "y": 9, + "width": 6, + "height": 4 + } + } + ] + }, + "layout": { + "x": 0, + "y": 7, + "width": 12, + "height": 1, + "is_column_break": true + } + }, + { + "id": 7334379080331064, + "definition": { + "title": "Scheduled Workflow Overview", + "background_color": "vivid_blue", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 4608661273329986, + "definition": { + "title": "Avg Scheduled Workflow Success Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_schedule_action_success_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_green" + 
} + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 4494292675722788, + "definition": { + "title": "Scheduled Workflow Success Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_schedule_action_success_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 0, + "width": 8, + "height": 3 + } + }, + { + "id": 6473814659260960, + "definition": { + "title": "Avg Buffer Overrun Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_schedule_buffer_overruns_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 4, + "height": 3 + } + }, + { + "id": 1932274358183966, + "definition": { + "title": "Buffer Overrun Rate over Time", + "title_size": "16", + "title_align": "left", + 
"show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_schedule_buffer_overruns_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 3, + "width": 8, + "height": 3 + } + }, + { + "id": 4132996472752296, + "definition": { + "title": "Avg Missed Catch-Up Window Rate", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_schedule_missed_catchup_window_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 6, + "width": 4, + "height": 3 + } + }, + { + "id": 2813049649778638, + "definition": { + "title": "Missed Catch-Up Window Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query":
"sum:temporal_cloud.cloud_metrics.v0_schedule_missed_catchup_window_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 6, + "width": 8, + "height": 3 + } + }, + { + "id": 5931318482078592, + "definition": { + "title": "Avg Rate-Limited Workflow Rate", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_schedule_rate_limited_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 9, + "width": 4, + "height": 3 + } + }, + { + "id": 8093326950057642, + "definition": { + "title": "Rate-Limited Workflow Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_schedule_rate_limited_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 9, 
+ "width": 8, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 8, + "width": 12, + "height": 1 + } + }, + { + "id": 5070840369040136, + "definition": { + "title": "Workflow Overview", + "background_color": "vivid_blue", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 7390564078879392, + "definition": { + "title": "Avg Workflow Cancellation Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_cancel_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 1229883020208502, + "definition": { + "title": "Workflow Cancellation Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_cancel_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 0, + "width": 8, + "height": 3 + } + }, + { + "id": 2309946839235418, + "definition": { + "title": "Avg 
Continued-As-New Workflow Rate", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_continued_as_new_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "custom_bg", + "custom_bg_color": "#65a8e6" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 4, + "height": 3 + } + }, + { + "id": 38538954247440, + "definition": { + "title": "Continued-As-New Workflow Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_continued_as_new_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 3, + "width": 8, + "height": 3 + } + }, + { + "id": 2741121569755672, + "definition": { + "title": "Avg Workflow Failure Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_failed_increase1m{$Namespace}.as_rate()", + 
"aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 6, + "width": 4, + "height": 3 + } + }, + { + "id": 4128572692652006, + "definition": { + "title": "Workflow Failure Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_failed_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 6, + "width": 8, + "height": 3 + } + }, + { + "id": 5478455287138638, + "definition": { + "title": "Avg Workflow Success Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_success_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_green" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 9, + "width": 4, + "height":
3 + } + }, + { + "id": 2007567121576290, + "definition": { + "title": "Workflow Success Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_success_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 9, + "width": 8, + "height": 3 + } + }, + { + "id": 6862787917948638, + "definition": { + "title": "Avg Workflow Termination Rate", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_terminate_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 12, + "width": 4, + "height": 3 + } + }, + { + "id": 3869465964272090, + "definition": { + "title": "Workflow Termination Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], +
"queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_terminate_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 12, + "width": 8, + "height": 3 + } + }, + { + "id": 1056987583365914, + "definition": { + "title": "Avg Workflow Timeout Rate", + "title_size": "16", + "title_align": "left", + "time": {}, + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_timeout_increase1m{$Namespace}.as_rate()", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 0, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 15, + "width": 4, + "height": 3 + } + }, + { + "id": 5172304773900428, + "definition": { + "title": "Workflow Timeout Rate over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:temporal_cloud.cloud_metrics.v0_workflow_timeout_increase1m{$Namespace} by {temporal_namespace}.as_rate()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" 
+ } + ] + }, + "layout": { + "x": 4, + "y": 15, + "width": 8, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 9, + "width": 12, + "height": 1 + } + }, + { + "id": 2643743209353550, + "definition": { + "title": "Replication Lag Overview", + "background_color": "vivid_blue", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 4988540215976294, + "definition": { + "title": "Avg Replication Lag (P50)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_cloud.cloud_metrics.v0_replication_lag_p50{$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 7728657278450226, + "definition": { + "title": "Replication Lag (P50) over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_cloud.cloud_metrics.v0_replication_lag_p50{$Namespace} by {temporal_namespace}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + 
"order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 0, + "width": 8, + "height": 3 + } + }, + { + "id": 6760508313946898, + "definition": { + "title": "Avg Replication Lag (P90)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_cloud.cloud_metrics.v0_replication_lag_p90{$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 4, + "height": 3 + } + }, + { + "id": 411485780589910, + "definition": { + "title": "Replication Lag (P90) over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "time": {}, + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_cloud.cloud_metrics.v0_replication_lag_p90{$Namespace} by {temporal_namespace}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 3, + "width": 8, + "height": 3 + } + }, + { + "id": 
1521266053652292, + "definition": { + "title": "Avg Replication Lag (P95)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_cloud.cloud_metrics.v0_replication_lag_p95{$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 6, + "width": 4, + "height": 3 + } + }, + { + "id": 7111826559326150, + "definition": { + "title": "Replication Lag (P95) over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_cloud.cloud_metrics.v0_replication_lag_p95{$Namespace} by {temporal_namespace}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 6, + "width": 8, + "height": 3 + } + }, + { + "id": 1857933666822954, + "definition": { + "title": "Avg Replication Lag (P99)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + 
"queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_cloud.cloud_metrics.v0_replication_lag_p99{$Namespace}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1" + } + ], + "conditional_formats": [ + { + "comparator": "<=", + "value": 0.1, + "palette": "black_on_light_green" + }, + { + "comparator": ">", + "value": 0.1, + "palette": "black_on_light_yellow" + }, + { + "comparator": ">", + "value": 0.2, + "palette": "black_on_light_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 9, + "width": 4, + "height": 3 + } + }, + { + "id": 1978734770597192, + "definition": { + "title": "Replication Lag (P99) over Time", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:temporal_cloud.cloud_metrics.v0_replication_lag_p99{$Namespace} by {temporal_namespace}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 9, + "width": 8, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 10, + "width": 12, + "height": 1 + } + } + ], + "template_variables": [ + { + "name": "Namespace", + "prefix": "temporal_namespace", + "available_values": [], + "default": "*" + } + ], + "layout_type": "ordered", + "notify_list": [], + "reflow_type": "fixed" +} diff --git a/temporal_cloud/assets/monitors/high_grpc_error_percentage.json 
b/temporal_cloud/assets/monitors/high_grpc_error_percentage.json new file mode 100644 index 0000000000000..8415371395d7b --- /dev/null +++ b/temporal_cloud/assets/monitors/high_grpc_error_percentage.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2024-12-10", + "last_updated_at": "2024-12-10", + "title": "High gRPC error percentage", + "description": "This monitor alerts when the percentage of gRPC errors exceeds the defined threshold for your Temporal Cloud instance, indicating potential issues with service communication that could impact workflow executions and overall system reliability.", + "definition": { + "id": 159196278, + "name": "High gRPC error percentage", + "type": "query alert", + "query": "avg(last_5m):(sum:temporal_cloud.cloud_metrics.v0_frontend_service_error_increase1m{*} by {temporal_namespace,operation}.as_rate() / sum:temporal_cloud.cloud_metrics.v0_frontend_service_request_increase1m{*} by {temporal_namespace,operation}.as_rate()) * 100 > 10", + "message": "{{#is_warning}}\nThe gRPC error percentage for Temporal Cloud operation: **{{operation.name}}** in namespace: **{{temporal_namespace.name}}** has exceeded the warning threshold.\nCurrent error percentage: **{{value}}%**\nThreshold: {{warn_threshold}}%\n{{/is_warning}}\n\n{{#is_alert}}\nThe gRPC error percentage for Temporal Cloud operation: **{{operation.name}}** in namespace: **{{temporal_namespace.name}}** has exceeded the alert threshold.\nCurrent error percentage: **{{value}}%**\nThreshold: {{threshold}}%\n{{/is_alert}}\n\n@example@example.com", + "tags": [ + "integration:temporal-cloud" + ], + "options": { + "thresholds": { + "critical": 10, + "warning": 5 + }, + "notify_audit": false, + "on_missing_data": "show_no_data", + "include_tags": false, + "new_group_delay": 0, + "silenced": {} + }, + "priority": 1, + "restriction_policy": { + "bindings": [] + } + }, + "tags": [ + "integration:temporal-cloud" + ] +} diff --git 
a/temporal_cloud/assets/monitors/high_service_latency.json b/temporal_cloud/assets/monitors/high_service_latency.json new file mode 100644 index 0000000000000..7de6684cb62e8 --- /dev/null +++ b/temporal_cloud/assets/monitors/high_service_latency.json @@ -0,0 +1,35 @@ +{ + "version": 2, + "created_at": "2024-12-10", + "last_updated_at": "2024-12-10", + "title": "High service latency", + "description": "This monitor alerts when the 99th percentile service latency exceeds the defined threshold for your Temporal Cloud instance, indicating potential performance degradation that could impact workflow execution times and overall system responsiveness.", + "definition": { + "id": 160133052, + "name": "High service latency", + "type": "query alert", + "query": "avg(last_5m):avg:temporal_cloud.cloud_metrics.v0_service_latency_p99{*} by {temporal_namespace,operation} > 0.2", + "message": "{{#is_warning}}\nThe P99 service latency for Temporal Cloud operation: **{{operation.name}}** in namespace: **{{temporal_namespace.name}}** has exceeded the warning threshold.\nCurrent Service Latency (P99): **{{value}} seconds**\nThreshold: {{warn_threshold}} seconds\n{{/is_warning}}\n\n{{#is_alert}}\nThe P99 service latency for Temporal Cloud operation: **{{operation.name}}** in namespace: **{{temporal_namespace.name}}** has exceeded the alert threshold.\nCurrent Service Latency (P99): **{{value}} seconds**\nThreshold: {{threshold}} seconds\n{{/is_alert}}\n\n@example@example.com", + "tags": [ + "integration:temporal-cloud" + ], + "options": { + "thresholds": { + "critical": 0.2, + "warning": 0.1 + }, + "notify_audit": false, + "on_missing_data": "show_no_data", + "include_tags": false, + "new_group_delay": 60, + "silenced": {} + }, + "priority": 2, + "restriction_policy": { + "bindings": [] + } + }, + "tags": [ + "integration:temporal-cloud" + ] +} diff --git a/temporal_cloud/assets/temporal_cloud.svg b/temporal_cloud/assets/temporal_cloud.svg new file mode 100644 index 
0000000000000..d152fae7bb724 --- /dev/null +++ b/temporal_cloud/assets/temporal_cloud.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/temporal_cloud/images/temporal_cloud_overview_1.png b/temporal_cloud/images/temporal_cloud_overview_1.png new file mode 100644 index 0000000000000..e8a7187f37ae5 Binary files /dev/null and b/temporal_cloud/images/temporal_cloud_overview_1.png differ diff --git a/temporal_cloud/images/temporal_cloud_overview_2.png b/temporal_cloud/images/temporal_cloud_overview_2.png new file mode 100644 index 0000000000000..3f9977dcfcf82 Binary files /dev/null and b/temporal_cloud/images/temporal_cloud_overview_2.png differ diff --git a/temporal_cloud/images/temporal_cloud_overview_3.png b/temporal_cloud/images/temporal_cloud_overview_3.png new file mode 100644 index 0000000000000..cd08c34104b58 Binary files /dev/null and b/temporal_cloud/images/temporal_cloud_overview_3.png differ diff --git a/temporal_cloud/images/temporal_cloud_overview_4.png b/temporal_cloud/images/temporal_cloud_overview_4.png new file mode 100644 index 0000000000000..112cdb093414f Binary files /dev/null and b/temporal_cloud/images/temporal_cloud_overview_4.png differ diff --git a/temporal_cloud/images/temporal_cloud_overview_5.png b/temporal_cloud/images/temporal_cloud_overview_5.png new file mode 100644 index 0000000000000..1462ff04449a8 Binary files /dev/null and b/temporal_cloud/images/temporal_cloud_overview_5.png differ diff --git a/temporal_cloud/manifest.json b/temporal_cloud/manifest.json index 6a777edc5ab90..d8ffd57a41b10 100644 --- a/temporal_cloud/manifest.json +++ b/temporal_cloud/manifest.json @@ -8,10 +8,38 @@ "configuration": "README.md#Setup", "support": "README.md#Support", "changelog": "CHANGELOG.md", - "description": "", + "description": "Gain insights into system health, workflow efficiency, task execution and performance bottlenecks for your instance.", "title": "Temporal Cloud", - "media": [], + "media": [ + { + "caption": "Temporal Cloud - 
Overview 1", + "image_url": "images/temporal_cloud_overview_1.png", + "media_type": "image" + }, + { + "caption": "Temporal Cloud - Overview 2", + "image_url": "images/temporal_cloud_overview_2.png", + "media_type": "image" + }, + { + "caption": "Temporal Cloud - Overview 3", + "image_url": "images/temporal_cloud_overview_3.png", + "media_type": "image" + }, + { + "caption": "Temporal Cloud - Overview 4", + "image_url": "images/temporal_cloud_overview_4.png", + "media_type": "image" + }, + { + "caption": "Temporal Cloud - Overview 5", + "image_url": "images/temporal_cloud_overview_5.png", + "media_type": "image" + } + ], "classifier_tags": [ + "Category::Cloud", + "Category::Developer Tools", "Category::Metrics", "Offering::Integration", "Submitted Data Type::Metrics" @@ -27,12 +55,19 @@ }, "metrics": { "prefix": "temporal_cloud.", - "check": [], + "check": "temporal_cloud.cloud_metrics.v0_frontend_service_request_increase1m", "metadata_path": "metadata.csv" }, "service_checks": { "metadata_path": "assets/service_checks.json" } + }, + "dashboards": { + "Temporal Cloud - Overview": "assets/dashboards/temporal_cloud_overview.json" + }, + "monitors": { + "High gRPC error percentage": "assets/monitors/high_grpc_error_percentage.json", + "High service latency": "assets/monitors/high_service_latency.json" } }, "author": { diff --git a/temporal_cloud/metadata.csv b/temporal_cloud/metadata.csv index 02cde5e98381e..036ddb89057a4 100644 --- a/temporal_cloud/metadata.csv +++ b/temporal_cloud/metadata.csv @@ -1 +1,27 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name,curated_metric,sample_tags +temporal_cloud.cloud_metrics.v0_frontend_service_error_increase1m,count,60,,,Increase in gRPC errors,-1,temporal_cloud,Frontend Service Error,, +temporal_cloud.cloud_metrics.v0_frontend_service_request_increase1m,count,60,,,Increase in gRPC requests received,0,temporal_cloud,Frontend Service Request,, 
+temporal_cloud.cloud_metrics.v0_poll_success_increase1m,count,60,,,Increase in count of tasks that are successfully matched to a poller,1,temporal_cloud,Poll Success,, +temporal_cloud.cloud_metrics.v0_poll_success_sync_increase1m,count,60,,,Increase in count of tasks that are successfully sync matched to a poller,1,temporal_cloud,Poll Success Sync,, +temporal_cloud.cloud_metrics.v0_poll_timeout_increase1m,count,60,,,"When no tasks are available for a poller before timing out, this is the increase in count of such tasks",-1,temporal_cloud,Poll Timeout,, +temporal_cloud.cloud_metrics.v0_replication_lag_p50,gauge,,second,,P50 value using histogram of replication lag during a specific time interval for a multi-region Namespace.,-1,temporal_cloud,Replication Lag P50,, +temporal_cloud.cloud_metrics.v0_replication_lag_p90,gauge,,second,,P90 value using histogram of replication lag during a specific time interval for a multi-region Namespace.,-1,temporal_cloud,Replication Lag P90,, +temporal_cloud.cloud_metrics.v0_replication_lag_p95,gauge,,second,,P95 value using histogram of replication lag during a specific time interval for a multi-region Namespace.,-1,temporal_cloud,Replication Lag P95,, +temporal_cloud.cloud_metrics.v0_replication_lag_p99,gauge,,second,,P99 value using histogram of replication lag during a specific time interval for a multi-region Namespace.,-1,temporal_cloud,Replication Lag P99,, +temporal_cloud.cloud_metrics.v0_resource_exhausted_error_increase1m,count,60,,,Increase in gRPC requests received that were rate-limited,-1,temporal_cloud,Resource Exhausted Error,, +temporal_cloud.cloud_metrics.v0_schedule_action_success_increase1m,count,60,,,Increase in count of successful execution of a Scheduled Workflow.,1,temporal_cloud,Schedule Action Success,, +temporal_cloud.cloud_metrics.v0_schedule_buffer_overruns_increase1m,count,60,,,"When average schedule run length is greater than average schedule interval while a buffer_all overlap policy is configured, this is the 
increase in count of such scheduled workflow executions",-1,temporal_cloud,Schedule Buffer Overruns,, +temporal_cloud.cloud_metrics.v0_schedule_missed_catchup_window_increase1m,count,60,,,Increase in count of skipped Scheduled executions when Workflows were delayed longer than the catchup window.,-1,temporal_cloud,Schedule Missed Catchup Window,, +temporal_cloud.cloud_metrics.v0_schedule_rate_limited_increase1m,count,60,,,Increase in count of Scheduled Workflows that were delayed due to exceeding a rate limit.,-1,temporal_cloud,Schedule Rate Limited,, +temporal_cloud.cloud_metrics.v0_service_latency_p50,gauge,,second,,"P50 latency for SignalWithStartWorkflowExecution, SignalWorkflowExecution, StartWorkflowExecution operations.",-1,temporal_cloud,Service Latency P50,, +temporal_cloud.cloud_metrics.v0_service_latency_p90,gauge,,second,,"P90 latency for SignalWithStartWorkflowExecution, SignalWorkflowExecution, StartWorkflowExecution operations.",-1,temporal_cloud,Service Latency P90,, +temporal_cloud.cloud_metrics.v0_service_latency_p95,gauge,,second,,"P95 latency for SignalWithStartWorkflowExecution, SignalWorkflowExecution, StartWorkflowExecution operations.",-1,temporal_cloud,Service Latency P95,, +temporal_cloud.cloud_metrics.v0_service_latency_p99,gauge,,second,,"P99 latency for SignalWithStartWorkflowExecution, SignalWorkflowExecution, StartWorkflowExecution operations.",-1,temporal_cloud,Service Latency P99,, +temporal_cloud.cloud_metrics.v0_state_transition_increase1m,count,60,,,Increase in count of state transitions for each Namespace,0,temporal_cloud,State Transition,, +temporal_cloud.cloud_metrics.v0_total_action_increase1m,count,60,,,Increase in count of Temporal Cloud Actions,0,temporal_cloud,Total Action,, +temporal_cloud.cloud_metrics.v0_workflow_cancel_increase1m,count,60,,,Increase in count of Workflows canceled before completing execution.,-1,temporal_cloud,Workflow Cancel,, 
+temporal_cloud.cloud_metrics.v0_workflow_continued_as_new_increase1m,count,60,,,Increase in count of Workflow Executions that were Continued-As-New from a past execution.,0,temporal_cloud,Workflow Continued As New,, +temporal_cloud.cloud_metrics.v0_workflow_failed_increase1m,count,60,,,Increase in count of Workflows that failed before completion.,-1,temporal_cloud,Workflow Failed,, +temporal_cloud.cloud_metrics.v0_workflow_success_increase1m,count,60,,,Increase in count of Workflows that successfully completed.,1,temporal_cloud,Workflow Success,, +temporal_cloud.cloud_metrics.v0_workflow_terminate_increase1m,count,60,,,Increase in count of Workflows terminated before completing execution.,-1,temporal_cloud,Workflow Terminate,, +temporal_cloud.cloud_metrics.v0_workflow_timeout_increase1m,count,60,,,Increase in count of Workflows that timed out before completing execution.,-1,temporal_cloud,Workflow Timeout,,