From f4b94c2369b7b8a2c32a96136b324844ec0d3d93 Mon Sep 17 00:00:00 2001 From: Chris Helma Date: Tue, 19 Nov 2024 09:07:11 -0600 Subject: [PATCH 1/6] Added initial CloudWatch Dashboard for RFS Signed-off-by: Chris Helma --- .../reindex-from-snapshot-dashboard.json | 292 ++++++++++++++++++ .../reindex-from-snapshot-stack.ts | 34 ++ 2 files changed, 326 insertions(+) create mode 100644 deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json diff --git a/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json new file mode 100644 index 000000000..9e50e13a3 --- /dev/null +++ b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json @@ -0,0 +1,292 @@ +{ + "variables": [ + { + "type": "property", + "property": "region", + "inputType": "input", + "id": "REGION", + "label": "Region", + "defaultValue": "us-east-1", + "visible": false + }, + { + "type": "property", + "property": "DomainName", + "inputType": "input", + "id": "TC_DOMAIN_NAME", + "label": "Target Cluster Domain Name", + "defaultValue": "placeholder-name", + "visible": true + }, + { + "type": "pattern", + "pattern": "MA_STAGE", + "inputType": "input", + "id": "MA_STAGE", + "label": "Migration Assistant Stage", + "defaultValue": "placeholder-stage", + "visible": false + }, + { + "type": "pattern", + "pattern": "ACCOUNT_ID", + "inputType": "input", + "id": "ACCOUNT_ID", + "label": "ACCOUNT_ID", + "defaultValue": "ACCOUNT_ID", + "visible": false + } + ], + "widgets": [ + { + "height": 1, + "width": 24, + "y": 0, + "x": 0, + "type": "text", + "properties": { + "markdown": "# Target Cluster\n", + "background": "transparent" + } + }, + { + "height": 8, + "width": 12, + "y": 1, + "x": 0, + "type": "metric", + "properties": { + "view": "timeSeries", + "stacked": false, + "metrics": [ + [ { "expression": 
"METRICS()/1000", "id": "e1", "region": "REGION" } ], + [ "AWS/ES", "IndexingRate", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "region": "region", "label": "Document Ingested per 60 seconds - MIN: ${MIN}, MAX: ${MAX}, AVG: ${AVG}", "id": "m1", "visible": false } ] + ], + "region": "REGION", + "title": "Target Cluster Document Index Rate", + "yAxis": { + "left": { + "label": "Thousands", + "showUnits": false + } + }, + "period": 60, + "stat": "Sum" + } + }, + { + "height": 8, + "width": 12, + "y": 1, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ { "expression": "METRICS()/1000", "id": "e1", "region": "REGION" } ], + [ "AWS/ES", "SearchableDocuments", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "region": "REGION", "label": "SearchableDocuments - MIN: ${MIN}, MAX: ${MAX}", "id": "m1", "visible": false } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "REGION", + "title": "Target Cluster SearchableDocuments", + "period": 60, + "stat": "Average", + "yAxis": { + "left": { + "label": "Thousands", + "showUnits": false + } + } + } + }, + { + "height": 8, + "width": 12, + "y": 9, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ES", "4xx", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "region": "REGION", "label": "4xx - ${SUM}" } ], + [ ".", "3xx", ".", ".", ".", ".", { "region": "REGION", "label": "3xx - ${SUM}" } ], + [ ".", "2xx", ".", ".", ".", ".", { "region": "REGION", "label": "2xx - ${SUM}" } ], + [ ".", "5xx", ".", ".", ".", ".", { "region": "REGION", "label": "5xx - ${SUM}" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "REGION", + "stat": "Sum", + "period": 300, + "title": "Target Cluster Status Codes (per 5 minutes)" + } + }, + { + "height": 8, + "width": 12, + "y": 9, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ES", "CPUUtilization", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "stat": "Minimum", 
"label": "Min Data Node CPU Utilization", "color": "#2ca02c", "region": "REGION" } ], + [ "...", { "stat": "Maximum", "label": "Max Data Node CPU Utilization", "color": "#d62728", "region": "REGION" } ], + [ "...", { "stat": "Average", "label": "Avg Data Node CPU Utilization", "color": "#1f77b4", "region": "REGION" } ], + [ "AWS/ES", "MasterCPUUtilization", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "stat": "Minimum", "label": "Min Master Node CPU Utilization", "color": "#98df8a", "region": "REGION" } ], + [ "...", { "stat": "Maximum", "label": "Max Master Node CPU Utilization", "color": "#ff9896", "region": "REGION" } ], + [ "...", { "stat": "Average", "label": "Avg Master Node CPU Utilization", "color": "#ff7f0e", "region": "REGION" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "REGION", + "title": "Target Cluster CPU Utilization by Node", + "period": 60, + "yAxis": { + "left": { + "label": "CPU Utilization (%)", + "min": 0, + "max": 100, + "showUnits": false + } + }, + "legend": { + "position": "bottom" + } + } + }, + { + "height": 8, + "width": 12, + "y": 17, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ { "expression": "METRICS()/1000", "label": "", "id": "e1", "region": "REGION" } ], + [ "AWS/ES", "ClusterUsedSpace", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "id": "m1", "visible": false, "period": 60, "region": "REGION" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "REGION", + "title": "Target Cluster Used Space", + "period": 60, + "stat": "Average", + "yAxis": { + "left": { + "label": "GB", + "showUnits": false + } + } + } + }, + { + "height": 8, + "width": 12, + "y": 17, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ES", "ThroughputThrottle", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "id": "m1", "period": 60, "region": "REGION" } ], + [ ".", "IopsThrottle", ".", ".", ".", ".", { "period": 60, "region": "REGION", "id": 
"m2" } ] + ], + "view": "timeSeries", + "stacked": false, + "region": "REGION", + "title": "Target Cluster EBS Throttling", + "period": 60, + "stat": "Average" + } + }, + { + "height": 1, + "width": 24, + "y": 25, + "x": 0, + "type": "text", + "properties": { + "markdown": "# Reindex-From-Snapshot Workers", + "background": "transparent" + } + }, + { + "height": 8, + "width": 12, + "y": 26, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "OpenSearchMigrations", "bytesSent", "OTelLib", "documentMigration", { "region": "REGION", "label": "Bytes Sent - MIN - ${MIN}, MAX - ${MAX}, AVG - ${AVG}" } ] + ], + "period": 60, + "region": "REGION", + "stacked": false, + "title": "RFS Reindexing Traffic", + "view": "timeSeries", + "stat": "Sum", + "yAxis": { + "left": { + "label": "Bytes", + "showUnits": false + } + } + } + }, + { + "height": 8, + "width": 12, + "y": 26, + "x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ECS", "CPUUtilization", "ServiceName", "migration-MA_STAGE-reindex-from-snapshot", "ClusterName", "migration-MA_STAGE-ecs-cluster", { "region": "REGION", "label": "RFS Workers - MIN - ${MIN}, MAX - ${MAX}, AVG - ${AVG}" } ] + ], + "period": 60, + "region": "REGION", + "stacked": false, + "title": "RFS Workers Reporting in During Period", + "view": "timeSeries", + "stat": "SampleCount" + } + }, + { + "height": 8, + "width": 12, + "y": 34, + "x": 0, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ECS", "CPUUtilization", "ServiceName", "migration-MA_STAGE-reindex-from-snapshot", "ClusterName", "migration-MA_STAGE-ecs-cluster", { "stat": "Minimum", "region": "REGION", "color": "#2ca02c" } ], + [ "...", { "stat": "Average", "region": "REGION", "color": "#1f77b4" } ], + [ "...", { "stat": "Maximum", "region": "REGION", "color": "#d62728" } ] + ], + "period": 60, + "region": "REGION", + "stacked": false, + "title": "RFS CPU utilization", + "view": "timeSeries" + } + }, + { + "height": 8, + "width": 12, + "y": 34, + 
"x": 12, + "type": "metric", + "properties": { + "metrics": [ + [ "AWS/ECS", "MemoryUtilization", "ServiceName", "migration-MA_STAGE-reindex-from-snapshot", "ClusterName", "migration-MA_STAGE-ecs-cluster", { "stat": "Minimum", "region": "REGION", "color": "#2ca02c" } ], + [ "...", { "stat": "Average", "region": "REGION", "color": "#1f77b4" } ], + [ "...", { "stat": "Maximum", "region": "REGION", "color": "#d62728" } ] + ], + "period": 60, + "region": "REGION", + "stacked": false, + "title": "RFS Memory utilization", + "view": "timeSeries" + } + } + ] +} \ No newline at end of file diff --git a/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts b/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts index ff69e4683..a1dfe884d 100644 --- a/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts +++ b/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts @@ -21,8 +21,34 @@ import { import { RFSBackfillYaml, SnapshotYaml } from "../migration-services-yaml"; import { OtelCollectorSidecar } from "./migration-otel-collector-sidecar"; import { SharedLogFileSystem } from "../components/shared-log-file-system"; +import { CfnDashboard } from "aws-cdk-lib/aws-cloudwatch"; +import * as rfsDashboard from '../components/reindex-from-snapshot-dashboard.json'; +function setDefaultValueForVariable(variables: any[], variableName: string, defaultValue: string): any[] { + for (let i = 0; i < variables.length; i++) { + if (variables[i].id === variableName) { + variables[i].defaultValue = defaultValue; + console.log(`changing ${variables[i].defaultValue} to ${defaultValue}`) + break; + } + } + console.log(`returning ${JSON.stringify(variables)}`); + return variables; +} +function setAccountIdForDashboard(dashboardBody: any, account: string): any { + dashboardBody.variables = setDefaultValueForVariable(dashboardBody.variables, 
'ACCOUNT_ID', account) + return dashboardBody; +} +function setRegionForDashboard(dashboardBody: any, region: string): any { + dashboardBody.variables = setDefaultValueForVariable(dashboardBody.variables, 'REGION', region) + return dashboardBody; +} +function setStageForDashboard(dashboardBody: any, stage: string): any { + dashboardBody.variables = setDefaultValueForVariable(dashboardBody.variables, 'MA_STAGE', stage) + return dashboardBody; +} + export interface ReindexFromSnapshotProps extends StackPropsExt { readonly vpc: IVpc, readonly fargateCpuArch: CpuArchitecture, @@ -191,6 +217,14 @@ export class ReindexFromSnapshotStack extends MigrationServiceCore { ...props }); + let dashboard = setAccountIdForDashboard(rfsDashboard, this.account) + dashboard = setRegionForDashboard(dashboard, this.region) + dashboard = setStageForDashboard(dashboard, props.stage) + new CfnDashboard(this, 'RFSDashboard', { + dashboardName: `MigrationAssistant_ReindexFromSnapshot_${props.stage}_Dashboard`, + dashboardBody: JSON.stringify(dashboard) + }); + this.rfsBackfillYaml = new RFSBackfillYaml(); this.rfsBackfillYaml.ecs.cluster_name = `migration-${props.stage}-ecs-cluster`; this.rfsBackfillYaml.ecs.service_name = `migration-${props.stage}-reindex-from-snapshot`; From 38a60928f4b4a222dd36cf5c9597060ca1df62ea Mon Sep 17 00:00:00 2001 From: Chris Helma Date: Tue, 19 Nov 2024 09:36:29 -0600 Subject: [PATCH 2/6] Fixed some linting issues in RFS CDK code Signed-off-by: Chris Helma --- .../reindex-from-snapshot-stack.ts | 58 +++++++++++-------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts b/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts index a1dfe884d..3a8ee6610 100644 --- a/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts +++ 
b/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts @@ -25,29 +25,41 @@ import { CfnDashboard } from "aws-cdk-lib/aws-cloudwatch"; import * as rfsDashboard from '../components/reindex-from-snapshot-dashboard.json'; -function setDefaultValueForVariable(variables: any[], variableName: string, defaultValue: string): any[] { - for (let i = 0; i < variables.length; i++) { - if (variables[i].id === variableName) { - variables[i].defaultValue = defaultValue; - console.log(`changing ${variables[i].defaultValue} to ${defaultValue}`) - break; - } - } - console.log(`returning ${JSON.stringify(variables)}`); - return variables; -} -function setAccountIdForDashboard(dashboardBody: any, account: string): any { - dashboardBody.variables = setDefaultValueForVariable(dashboardBody.variables, 'ACCOUNT_ID', account) - return dashboardBody; -} -function setRegionForDashboard(dashboardBody: any, region: string): any { - dashboardBody.variables = setDefaultValueForVariable(dashboardBody.variables, 'REGION', region) - return dashboardBody; -} -function setStageForDashboard(dashboardBody: any, stage: string): any { - dashboardBody.variables = setDefaultValueForVariable(dashboardBody.variables, 'MA_STAGE', stage) - return dashboardBody; -} +interface DashboardVariable { + id: string; + defaultValue: string; + } + + function setDefaultValueForVariable(variables: DashboardVariable[], variableName: string, defaultValue: string): DashboardVariable[] { + for (const variable of variables) { + if (variable.id === variableName) { + variable.defaultValue = defaultValue; + console.log(`changing ${variable.defaultValue} to ${defaultValue}`); + break; + } + } + console.log(`returning ${JSON.stringify(variables)}`); + return variables; + } + + interface DashboardBody { + variables: DashboardVariable[]; + } + + function setAccountIdForDashboard(dashboardBody: DashboardBody, account: string): DashboardBody { + dashboardBody.variables = 
setDefaultValueForVariable(dashboardBody.variables, 'ACCOUNT_ID', account); + return dashboardBody; + } + + function setRegionForDashboard(dashboardBody: DashboardBody, region: string): DashboardBody { + dashboardBody.variables = setDefaultValueForVariable(dashboardBody.variables, 'REGION', region); + return dashboardBody; + } + + function setStageForDashboard(dashboardBody: DashboardBody, stage: string): DashboardBody { + dashboardBody.variables = setDefaultValueForVariable(dashboardBody.variables, 'MA_STAGE', stage); + return dashboardBody; + } export interface ReindexFromSnapshotProps extends StackPropsExt { readonly vpc: IVpc, From 07d38f55b3a45827be5f3b82b96f9f877feb7fcc Mon Sep 17 00:00:00 2001 From: Chris Helma Date: Tue, 19 Nov 2024 10:27:40 -0600 Subject: [PATCH 3/6] Minor updates per PR comments Signed-off-by: Chris Helma --- .../lib/components/reindex-from-snapshot-dashboard.json | 4 ++-- .../lib/service-stacks/reindex-from-snapshot-stack.ts | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json index 9e50e13a3..5c4e972e9 100644 --- a/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json +++ b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json @@ -32,7 +32,7 @@ "pattern": "ACCOUNT_ID", "inputType": "input", "id": "ACCOUNT_ID", - "label": "ACCOUNT_ID", + "label": "Account ID", "defaultValue": "ACCOUNT_ID", "visible": false } @@ -99,7 +99,7 @@ } } }, - { + { "height": 8, "width": 12, "y": 9, diff --git a/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts b/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts index 3a8ee6610..38d6a9b01 100644 --- 
a/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts +++ b/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts @@ -34,11 +34,9 @@ interface DashboardVariable { for (const variable of variables) { if (variable.id === variableName) { variable.defaultValue = defaultValue; - console.log(`changing ${variable.defaultValue} to ${defaultValue}`); break; } } - console.log(`returning ${JSON.stringify(variables)}`); return variables; } From 0b1d394e2eb5c6133a33c990d3579718b4c63abd Mon Sep 17 00:00:00 2001 From: Chris Helma Date: Tue, 19 Nov 2024 12:57:26 -0600 Subject: [PATCH 4/6] Minor tweaks per team discussion Signed-off-by: Chris Helma --- .../lib/components/reindex-from-snapshot-dashboard.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json index 5c4e972e9..a5af825dc 100644 --- a/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json +++ b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json @@ -240,7 +240,8 @@ "type": "metric", "properties": { "metrics": [ - [ "AWS/ECS", "CPUUtilization", "ServiceName", "migration-MA_STAGE-reindex-from-snapshot", "ClusterName", "migration-MA_STAGE-ecs-cluster", { "region": "REGION", "label": "RFS Workers - MIN - ${MIN}, MAX - ${MAX}, AVG - ${AVG}" } ] + [ { "expression": "METRICS()/PERIOD(m1)*60", "id" : "e1", "region": "REGION" } ], + [ "AWS/ECS", "CPUUtilization", "ServiceName", "migration-MA_STAGE-reindex-from-snapshot", "ClusterName", "migration-MA_STAGE-ecs-cluster", { "region": "REGION", "label": "RFS Workers - MIN - ${MIN}, MAX - ${MAX}, AVG - ${AVG}", "id": "m1", "visible": false } ] ], "period": 60, "region": "REGION", From 
20b306c1deb02fd986cb6c81f1790b99e9b48180 Mon Sep 17 00:00:00 2001 From: Chris Helma Date: Tue, 19 Nov 2024 13:15:10 -0600 Subject: [PATCH 5/6] Added search for domain to RFS Dashboard Signed-off-by: Chris Helma --- .../lib/components/reindex-from-snapshot-dashboard.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json index a5af825dc..e7550c732 100644 --- a/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json +++ b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json @@ -12,9 +12,11 @@ { "type": "property", "property": "DomainName", - "inputType": "input", + "inputType": "select", "id": "TC_DOMAIN_NAME", "label": "Target Cluster Domain Name", + "search": "{AWS/ES,ClientId,DomainName} MetricName=\"CPUUtilization\"", + "populateFrom": "DomainName", "defaultValue": "placeholder-name", "visible": true }, From c9df41bac7282ebcf6dd72547f851b15ef861896 Mon Sep 17 00:00:00 2001 From: Chris Helma Date: Tue, 19 Nov 2024 13:29:20 -0600 Subject: [PATCH 6/6] More updates per PR comments Signed-off-by: Chris Helma --- .../lib/components/reindex-from-snapshot-dashboard.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json index e7550c732..c498b9808 100644 --- a/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json +++ b/deployment/cdk/opensearch-service-migration/lib/components/reindex-from-snapshot-dashboard.json @@ -61,8 +61,8 @@ "view": "timeSeries", "stacked": false, "metrics": [ - [ { "expression": "METRICS()/1000", "id": 
"e1", "region": "REGION" } ], - [ "AWS/ES", "IndexingRate", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "region": "region", "label": "Document Ingested per 60 seconds - MIN: ${MIN}, MAX: ${MAX}, AVG: ${AVG}", "id": "m1", "visible": false } ] + [ { "expression": "METRICS()/1000/PERIOD(m1)*60", "id": "e1", "region": "REGION" } ], + [ "AWS/ES", "IndexingRate", "DomainName", "TC_DOMAIN_NAME", "ClientId", "ACCOUNT_ID", { "region": "REGION", "label": "Documents Ingested (including replicas) - MIN: ${MIN}, MAX: ${MAX}, AVG: ${AVG}", "id": "m1", "visible": false } ] ], "region": "REGION", "title": "Target Cluster Document Index Rate", @@ -248,7 +248,7 @@ "period": 60, "region": "REGION", "stacked": false, - "title": "RFS Workers Reporting in During Period", + "title": "RFS Workers Reporting In", "view": "timeSeries", "stat": "SampleCount" }