From 647f6014f7cc4a0b5dd956e532f73c920d462977 Mon Sep 17 00:00:00 2001 From: Chris Helma <25470211+chelma@users.noreply.github.com> Date: Wed, 2 Oct 2024 14:48:00 -0500 Subject: [PATCH] RFS Workers now use EBS to store/unpack the snapshot (#1041) * RFS Workers now use EBS to store/unpack the snapshot Signed-off-by: Chris Helma * Updated CDK unit test per changes Signed-off-by: Chris Helma * Updates per PR comments Signed-off-by: Chris Helma * More updates per PR comments Signed-off-by: Chris Helma * Fix unit test Signed-off-by: Chris Helma * Corrected RFS Snapshot Volume tag Signed-off-by: Chris Helma --------- Signed-off-by: Chris Helma --- .../cdk.context.json | 1 + .../lib/network-stack.ts | 2 +- .../service-stacks/migration-service-core.ts | 6 ++ .../reindex-from-snapshot-stack.ts | 66 ++++++++++++++++--- .../lib/stack-composer.ts | 4 +- .../opensearch-service-migration/options.md | 1 + .../test/reindex-from-snapshot-stack.test.ts | 11 ++-- 7 files changed, 76 insertions(+), 15 deletions(-) diff --git a/deployment/cdk/opensearch-service-migration/cdk.context.json b/deployment/cdk/opensearch-service-migration/cdk.context.json index 94d756654..85f305f3f 100644 --- a/deployment/cdk/opensearch-service-migration/cdk.context.json +++ b/deployment/cdk/opensearch-service-migration/cdk.context.json @@ -27,6 +27,7 @@ }, "vpcId": "", "reindexFromSnapshotServiceEnabled": true, + "reindexFromSnapshotMaxShardSizeGiB": 80, "trafficReplayerServiceEnabled": true } } diff --git a/deployment/cdk/opensearch-service-migration/lib/network-stack.ts b/deployment/cdk/opensearch-service-migration/lib/network-stack.ts index 61fef0b44..03f0d01ee 100644 --- a/deployment/cdk/opensearch-service-migration/lib/network-stack.ts +++ b/deployment/cdk/opensearch-service-migration/lib/network-stack.ts @@ -99,7 +99,7 @@ export class NetworkStack extends Stack { // General interface endpoints const interfaceEndpoints = [ InterfaceVpcEndpointAwsService.CLOUDWATCH_LOGS, // Push Logs from tasks - InterfaceVpcEndpointAwsService.CLOUDWATCH_MONITORING, // Pull Metrics from Migration Console + InterfaceVpcEndpointAwsService.CLOUDWATCH_MONITORING, // Pull Metrics from Migration Console InterfaceVpcEndpointAwsService.ECR_DOCKER, // Pull Images on Startup InterfaceVpcEndpointAwsService.ECR, // List Images on Startup InterfaceVpcEndpointAwsService.ECS_AGENT, // Task Container Metrics diff --git a/deployment/cdk/opensearch-service-migration/lib/service-stacks/migration-service-core.ts b/deployment/cdk/opensearch-service-migration/lib/service-stacks/migration-service-core.ts index 0ae4bc223..da3744782 100644 --- a/deployment/cdk/opensearch-service-migration/lib/service-stacks/migration-service-core.ts +++ b/deployment/cdk/opensearch-service-migration/lib/service-stacks/migration-service-core.ts @@ -13,6 +13,7 @@ import { Volume, AwsLogDriverMode, ContainerDependencyCondition, + ServiceManagedVolume } from "aws-cdk-lib/aws-ecs"; import {DockerImageAsset} from "aws-cdk-lib/aws-ecr-assets"; import {Duration, RemovalPolicy, Stack} from "aws-cdk-lib"; @@ -186,6 +187,11 @@ export class MigrationServiceCore extends Stack { vpcSubnets: props.vpc.selectSubnets({subnetType: SubnetType.PRIVATE_WITH_EGRESS}), }); + // Add any ServiceManagedVolumes to the service, if they exist + if (props.volumes) { + props.volumes.filter(vol => vol instanceof ServiceManagedVolume).forEach(vol => fargateService.addVolume(vol as ServiceManagedVolume)); + } + if (props.targetGroups) { props.targetGroups.filter(tg => tg !== undefined).forEach(tg => tg.addTarget(fargateService)); } diff --git a/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts b/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts index 66525ec04..180f9e8a5 100644 --- a/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts +++ b/deployment/cdk/opensearch-service-migration/lib/service-stacks/reindex-from-snapshot-stack.ts @@ -1,6 +1,12 @@ import {StackPropsExt} from "../stack-composer"; -import {IVpc, SecurityGroup} from "aws-cdk-lib/aws-ec2"; -import {CpuArchitecture} from "aws-cdk-lib/aws-ecs"; +import {Size} from "aws-cdk-lib/core"; +import {IVpc, SecurityGroup, EbsDeviceVolumeType} from "aws-cdk-lib/aws-ec2"; +import { + CpuArchitecture, + ServiceManagedVolume, + FileSystemType, + EbsPropagatedTagSource +} from "aws-cdk-lib/aws-ecs"; import {Construct} from "constructs"; import {join} from "path"; import {MigrationServiceCore} from "./migration-service-core"; @@ -24,7 +30,9 @@ export interface ReindexFromSnapshotProps extends StackPropsExt { readonly extraArgs?: string, readonly otelCollectorEnabled: boolean, readonly clusterAuthDetails: ClusterAuth - readonly sourceClusterVersion?: string + readonly sourceClusterVersion?: string, + readonly maxShardSizeGiB?: number + } export class ReindexFromSnapshotStack extends MigrationServiceCore { @@ -70,12 +78,16 @@ export class ReindexFromSnapshotStack extends MigrationServiceCore { const s3Uri = `s3://migration-artifacts-${this.account}-${props.stage}-${this.region}/rfs-snapshot-repo`; let command = "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments" const extraArgsDict = parseArgsToDict(props.extraArgs) - command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--s3-local-dir", "/tmp/s3_files") + const storagePath = "/storage" + const planningSize = props.maxShardSizeGiB ?? 80; + const maxShardSizeBytes = `${planningSize * 1024 * 1024 * 1024}` + command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--s3-local-dir", `"${storagePath}/s3_files"`) command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--s3-repo-uri", `"${s3Uri}"`) command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--s3-region", this.region) command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--snapshot-name", "rfs-snapshot") - command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--lucene-dir", "/lucene") + command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--lucene-dir", `"${storagePath}/lucene"`) command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--target-host", osClusterEndpoint) + command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--max-shard-size-bytes", maxShardSizeBytes) if (props.clusterAuthDetails.sigv4) { command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--target-aws-service-signing-name", props.clusterAuthDetails.sigv4.serviceSigningName) command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--target-aws-region", props.clusterAuthDetails.sigv4.region) @@ -113,19 +125,57 @@ export class ReindexFromSnapshotStack extends MigrationServiceCore { servicePolicies.push(getSecretsPolicy); } + const volumes = [sharedLogFileSystem.asVolume()]; + const mountPoints = [sharedLogFileSystem.asMountPoint()]; + + // Calculate the volume size based on the max shard size + // Have space for the snapshot and an unpacked copy, with buffer + const volumeSize = Math.max( + Math.ceil(planningSize * 2 * 1.15), + 1 + ) + + if (volumeSize > 16000) { + // 16 TiB is the maximum volume size for GP3 + throw new Error(`"Your max shard size of ${props.maxShardSizeGiB} GiB is too large to migrate."`) + } + + // Volume we'll use to download and unpack the snapshot + const snapshotVolume = new ServiceManagedVolume(this, 'SnapshotVolume', { + name: 'snapshot-volume', + managedEBSVolume: { + size: Size.gibibytes(volumeSize), + volumeType: EbsDeviceVolumeType.GP3, + fileSystemType: FileSystemType.XFS, + tagSpecifications: [{ + tags: { + Name: `rfs-snapshot-volume-${props.stage}`, + }, + propagateTags: EbsPropagatedTagSource.SERVICE, + }], + encrypted: true, + }, + }); + + volumes.push(snapshotVolume); + mountPoints.push({ + containerPath: storagePath, + readOnly: false, + sourceVolume: snapshotVolume.name, + }); + this.createService({ serviceName: 'reindex-from-snapshot', taskInstanceCount: 0, dockerDirectoryPath: join(__dirname, "../../../../../", "DocumentsFromSnapshotMigration/docker"), dockerImageCommand: ['/bin/sh', '-c', "/rfs-app/entrypoint.sh"], securityGroups: securityGroups, - volumes: [sharedLogFileSystem.asVolume()], - mountPoints: [sharedLogFileSystem.asMountPoint()], + volumes: volumes, + mountPoints: mountPoints, taskRolePolicies: servicePolicies, cpuArchitecture: props.fargateCpuArch, taskCpuUnits: 2048, taskMemoryLimitMiB: 4096, - ephemeralStorageGiB: 200, environment: { "RFS_COMMAND": command, "RFS_TARGET_USER": targetUser, diff --git a/deployment/cdk/opensearch-service-migration/lib/stack-composer.ts b/deployment/cdk/opensearch-service-migration/lib/stack-composer.ts index 9f52bc8a8..081ac969d 100644 --- a/deployment/cdk/opensearch-service-migration/lib/stack-composer.ts +++ b/deployment/cdk/opensearch-service-migration/lib/stack-composer.ts @@ -210,6 +210,7 @@ export class StackComposer { const otelCollectorEnabled = this.getContextForType('otelCollectorEnabled', 'boolean', defaultValues, contextJSON) const reindexFromSnapshotServiceEnabled = this.getContextForType('reindexFromSnapshotServiceEnabled', 'boolean', defaultValues, contextJSON) const reindexFromSnapshotExtraArgs = this.getContextForType('reindexFromSnapshotExtraArgs', 'string', defaultValues, contextJSON) + const reindexFromSnapshotMaxShardSizeGiB = this.getContextForType('reindexFromSnapshotMaxShardSizeGiB', 'number', defaultValues, contextJSON) const albAcmCertArn = this.getContextForType('albAcmCertArn', 'string', defaultValues, contextJSON); const managedServiceSourceSnapshotEnabled = this.getContextForType('managedServiceSourceSnapshotEnabled', 'boolean', defaultValues, contextJSON) @@ -483,7 +484,8 @@ export class StackComposer { otelCollectorEnabled: otelCollectorEnabled, defaultDeployId: defaultDeployId, fargateCpuArch: fargateCpuArch, - env: props.env + env: props.env, + maxShardSizeGiB: reindexFromSnapshotMaxShardSizeGiB }) this.addDependentStacks(reindexFromSnapshotStack, [migrationStack, openSearchStack, osContainerStack]) this.stacks.push(reindexFromSnapshotStack) diff --git a/deployment/cdk/opensearch-service-migration/options.md b/deployment/cdk/opensearch-service-migration/options.md index 8186f2a38..34a3e8949 100644 --- a/deployment/cdk/opensearch-service-migration/options.md +++ b/deployment/cdk/opensearch-service-migration/options.md @@ -60,6 +60,7 @@ The optional component is: | reindexFromSnapshotExtraArgs | string | "--target-aws-region us-east-1 --target-aws-service-signing-name es" | Extra arguments to provide to the Document Migration command with space separation. See [RFS Arguments](../../../DocumentsFromSnapshotMigration/README.md#Arguments). [^1] | | sourceClusterEndpoint | string | `"https://source-cluster.elb.us-east-1.endpoint.com"` | The endpoint for the source cluster from which RFS will take a snapshot | | managedServiceSourceSnapshotEnabled | boolean | true | Create the necessary roles and trust relationships to take a snapshot of a managed service source cluster. This is only compatible with SigV4 auth. | +| reindexFromSnapshotMaxShardSizeGiB | integer | 80 | OPTIONAL: The size, in whole GiB, of the largest shard you want to migrate across all indices; used to ensure we have enough disk space reserved to perform the migration. Default: 80 GiB | ### VPC Options diff --git a/deployment/cdk/opensearch-service-migration/test/reindex-from-snapshot-stack.test.ts b/deployment/cdk/opensearch-service-migration/test/reindex-from-snapshot-stack.test.ts index 273892b67..4f74b2ba7 100644 --- a/deployment/cdk/opensearch-service-migration/test/reindex-from-snapshot-stack.test.ts +++ b/deployment/cdk/opensearch-service-migration/test/reindex-from-snapshot-stack.test.ts @@ -115,10 +115,11 @@ describe('ReindexFromSnapshotStack Tests', () => { Value: { "Fn::Join": [ "", - [ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir /tmp/s3_files --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --snapshot-name rfs-snapshot --lucene-dir /lucene --target-host ", + [ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir \"/storage/s3_files\" --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --snapshot-name rfs-snapshot --lucene-dir \"/storage/lucene\" --target-host ", { "Ref": "SsmParameterValuemigrationunittestdefaultosClusterEndpointC96584B6F00A464EAD1953AFF4B05118Parameter", }, + " --max-shard-size-bytes 85899345920" ], ], } @@ -182,11 +183,11 @@ describe('ReindexFromSnapshotStack Tests', () => { Value: { "Fn::Join": [ "", - [ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir /tmp/s3_files --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --snapshot-name rfs-snapshot --lucene-dir /lucene --target-host ", + [ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir \"/storage/s3_files\" --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --snapshot-name rfs-snapshot --lucene-dir \"/storage/lucene\" --target-host ", { "Ref": "SsmParameterValuemigrationunittestdefaultosClusterEndpointC96584B6F00A464EAD1953AFF4B05118Parameter", }, - " --target-aws-service-signing-name aoss --target-aws-region eu-west-1", + " --max-shard-size-bytes 85899345920 --target-aws-service-signing-name aoss --target-aws-region eu-west-1", ], ], } @@ -271,11 +272,11 @@ describe('ReindexFromSnapshotStack Tests', () => { Value: { "Fn::Join": [ "", - [ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir /tmp/s3_files --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --lucene-dir /lucene --target-host ", + [ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir \"/storage/s3_files\" --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --lucene-dir \"/storage/lucene\" --target-host ", { "Ref": "SsmParameterValuemigrationunittestdefaultosClusterEndpointC96584B6F00A464EAD1953AFF4B05118Parameter", }, - " --custom-arg value --flag --snapshot-name \"custom-snapshot\"" + " --max-shard-size-bytes 85899345920 --custom-arg value --flag --snapshot-name \"custom-snapshot\"" ] ] }