feat: add failover to on-demand in case request is failing (#3409)
Adds the option to create on-demand instances when spot creation fails.

## Problem

This module supports either spot or on-demand instances. When using spot (the default), instance creation can fail for several reasons. When AWS has insufficient spot capacity, it makes sense to request on-demand instances instead, but AWS does not support this kind of fallback via the fleet API. This PR addresses the problem and adds an opt-in option to create on-demand instances in case of insufficient spot capacity.
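
The control plane implements the fallback itself: when `CreateFleet` returns an error code that is listed in the new `onDemandFailoverOnError` input, it issues a second `CreateFleet` request for the capacity that is still missing, this time with on-demand as the target capacity type. A minimal sketch of that flow, assuming a simplified request builder (the helper and parameter names below are illustrative, not the actual control-plane implementation):

```typescript
import { CreateFleetCommand, CreateFleetCommandInput, EC2Client } from '@aws-sdk/client-ec2';

// Illustrative sketch only; the real logic lives in the control-plane lambda.
export async function createWithFailover(
  ec2: EC2Client,
  baseRequest: CreateFleetCommandInput,
  numberOfRunners: number,
  onDemandFailoverOnError: string[],
): Promise<string[]> {
  const request = (capacity: number, type: 'spot' | 'on-demand'): CreateFleetCommandInput => ({
    ...baseRequest,
    TargetCapacitySpecification: {
      TotalTargetCapacity: capacity,
      DefaultTargetCapacityType: type,
    },
  });

  // First attempt: spot capacity.
  const spot = await ec2.send(new CreateFleetCommand(request(numberOfRunners, 'spot')));
  const instanceIds = spot.Instances?.flatMap((i) => i.InstanceIds ?? []) ?? [];

  // Fall back to on-demand only for the capacity that is still missing,
  // and only when the returned error code is opted in.
  const missing = numberOfRunners - instanceIds.length;
  const canFailover = (spot.Errors ?? []).some(
    (e) => e.ErrorCode !== undefined && onDemandFailoverOnError.includes(e.ErrorCode),
  );
  if (missing > 0 && canFailover) {
    const onDemand = await ec2.send(new CreateFleetCommand(request(missing, 'on-demand')));
    instanceIds.push(...(onDemand.Instances?.flatMap((i) => i.InstanceIds ?? []) ?? []));
  }

  if (instanceIds.length === 0) {
    throw new Error('No instances could be created.');
  }
  return instanceIds;
}
```

The second request only asks for the capacity that spot could not deliver, which matches the reduced `TotalTargetCapacity` asserted by the new tests below.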

## Migrations

No migrations required.

## Opt-in

Opt in by setting the variable `enable_runner_on_demand_failover_for_errors`, for example to `["InsufficientInstanceCapacity"]`.


## Verification

The failover itself is not easy to test. However, because this change touches the multi-runner and runner modules as well as the scale-up and pool lambdas, the following checks are required:

- [x] Ephemeral example with pool
- [x] Multi runner example

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
npalm and github-actions[bot] authored Nov 29, 2023
1 parent a949ead commit d71e631
Showing 23 changed files with 384 additions and 112 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -532,6 +532,7 @@ We welcome any improvement to the standard module to make the default as secure
| <a name="input_enable_organization_runners"></a> [enable\_organization\_runners](#input\_enable\_organization\_runners) | Register runners to organization, instead of repo level | `bool` | `false` | no |
| <a name="input_enable_runner_binaries_syncer"></a> [enable\_runner\_binaries\_syncer](#input\_enable\_runner\_binaries\_syncer) | Option to disable the lambda to sync GitHub runner distribution, useful when using a pre-build AMI. | `bool` | `true` | no |
| <a name="input_enable_runner_detailed_monitoring"></a> [enable\_runner\_detailed\_monitoring](#input\_enable\_runner\_detailed\_monitoring) | Should detailed monitoring be enabled for the runner. Set this to true if you want to use detailed monitoring. See https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-cloudwatch-new.html for details. | `bool` | `false` | no |
| <a name="input_enable_runner_on_demand_failover_for_errors"></a> [enable\_runner\_on\_demand\_failover\_for\_errors](#input\_enable\_runner\_on\_demand\_failover\_for\_errors) | Enable on-demand failover. For example to fall back to on demand when no spot capacity is available the variable can be set to `InsufficientInstanceCapacity`. When not defined the default behavior is to retry later. | `list(string)` | `[]` | no |
| <a name="input_enable_runner_workflow_job_labels_check_all"></a> [enable\_runner\_workflow\_job\_labels\_check\_all](#input\_enable\_runner\_workflow\_job\_labels\_check\_all) | If set to true all labels in the workflow job must match the GitHub labels (os, architecture and `self-hosted`). When false if __any__ label matches it will trigger the webhook. | `bool` | `true` | no |
| <a name="input_enable_ssm_on_runners"></a> [enable\_ssm\_on\_runners](#input\_enable\_ssm\_on\_runners) | Enable to allow access to the runner instances for debugging purposes via SSM. Note that this adds additional permissions to the runner instances. | `bool` | `false` | no |
| <a name="input_enable_user_data_debug_logging_runner"></a> [enable\_user\_data\_debug\_logging\_runner](#input\_enable\_user\_data\_debug\_logging\_runner) | Option to enable debug logging for user-data, this logs all secrets as well. | `bool` | `false` | no |
@@ -15,6 +15,7 @@ runner_config:
- m5a.large
runners_maximum_count: 1
enable_ephemeral_runners: true
enable_on_demand_failover_for_errors: ['InsufficientInstanceCapacity']
create_service_linked_role_spot: true
scale_down_schedule_expression: cron(* * * * ? *)
runner_metadata_options:
8 changes: 4 additions & 4 deletions lambdas/functions/control-plane/jest.config.ts
@@ -6,10 +6,10 @@ const config: Config = {
...defaultConfig,
coverageThreshold: {
global: {
statements: 97.89,
branches: 94.64,
functions: 97.33,
lines: 98.21,
statements: 97.99,
branches: 96.04,
functions: 97.53,
lines: 98.3,
},
},
};
3 changes: 2 additions & 1 deletion lambdas/functions/control-plane/src/aws/runners.d.ts
@@ -37,7 +37,8 @@ export interface RunnerInputParameters {
maxSpotPrice?: string;
instanceAllocationStrategy: SpotAllocationStrategy;
};
numberOfRunners?: number;
numberOfRunners: number;
amiIdSsmParameterName?: string;
tracingEnabled?: boolean;
onDemandFailoverOnError?: string[];
}
168 changes: 166 additions & 2 deletions lambdas/functions/control-plane/src/aws/runners.test.ts
@@ -1,6 +1,7 @@
import {
CreateFleetCommand,
CreateFleetCommandInput,
CreateFleetInstance,
CreateFleetResult,
DefaultTargetCapacityType,
DescribeInstancesCommand,
@@ -140,6 +141,28 @@ describe('list instances', () => {
const resp = await listEC2Runners();
expect(resp.length).toBe(1);
});

it('Filter instances for state running.', async () => {
mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances);
await listEC2Runners({ statuses: ['running'] });
expect(mockEC2Client).toHaveReceivedCommandWith(DescribeInstancesCommand, {
Filters: [
{ Name: 'instance-state-name', Values: ['running'] },
{ Name: 'tag:ghr:Application', Values: ['github-action-runner'] },
],
});
});

it('Filter instances with status undefined, fall back to defaults.', async () => {
mockEC2Client.on(DescribeInstancesCommand).resolves(mockRunningInstances);
await listEC2Runners({ statuses: undefined });
expect(mockEC2Client).toHaveReceivedCommandWith(DescribeInstancesCommand, {
Filters: [
{ Name: 'instance-state-name', Values: ['running', 'pending'] },
{ Name: 'tag:ghr:Application', Values: ['github-action-runner'] },
],
});
});
});
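
// For reference, a sketch of the filter construction the two tests above exercise
// (illustrative only, hypothetical helper name; the actual logic lives in the runners
// implementation): when no statuses are passed, the listing falls back to the defaults.
function buildInstanceStateFilters(statuses?: string[]): { Name: string; Values: string[] }[] {
  return [
    { Name: 'instance-state-name', Values: statuses ?? ['running', 'pending'] },
    { Name: 'tag:ghr:Application', Values: ['github-action-runner'] },
  ];
}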

describe('terminate runner', () => {
@@ -178,7 +201,6 @@ describe('create runner', () => {
mockEC2Client.reset();
mockSSMClient.reset();

//mockEC2.createFleet.mockImplementation(() => mockCreateFleet);
mockEC2Client.on(CreateFleetCommand).resolves({ Instances: [{ InstanceIds: ['i-1234'] }] });
mockSSMClient.on(GetParameterCommand).resolves({});
});
@@ -333,6 +355,130 @@ describe('create runner with errors', () => {
expect(mockEC2Client).not.toHaveReceivedCommand(CreateFleetCommand);
expect(mockSSMClient).not.toHaveReceivedCommand(PutParameterCommand);
});

it('Error with undefined Instances and Errors.', async () => {
mockEC2Client.on(CreateFleetCommand).resolvesOnce({ Instances: undefined, Errors: undefined });
await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).rejects.toBeInstanceOf(Error);
});

it('Error with undefined InstanceIds and ErrorCode.', async () => {
mockEC2Client.on(CreateFleetCommand).resolvesOnce({
Instances: [{ InstanceIds: undefined }],
Errors: [
{
ErrorCode: undefined,
},
],
});
await expect(createRunner(createRunnerConfig(defaultRunnerConfig))).rejects.toBeInstanceOf(Error);
});
});

describe('create runner with errors fail over to OnDemand', () => {
const defaultRunnerConfig: RunnerConfig = {
allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED,
capacityType: 'spot',
type: 'Repo',
onDemandFailoverOnError: ['InsufficientInstanceCapacity'],
};
const defaultExpectedFleetRequestValues: ExpectedFleetRequestValues = {
type: 'Repo',
capacityType: 'spot',
allocationStrategy: SpotAllocationStrategy.CAPACITY_OPTIMIZED,
totalTargetCapacity: 1,
};
beforeEach(() => {
jest.clearAllMocks();
mockEC2Client.reset();
mockSSMClient.reset();

mockSSMClient.on(PutParameterCommand).resolves({});
mockSSMClient.on(GetParameterCommand).resolves({});
mockEC2Client.on(CreateFleetCommand).resolves({ Instances: [] });
});

it('test InsufficientInstanceCapacity fallback to on demand.', async () => {
const instancesIds = ['i-123'];
createFleetMockWithWithOnDemandFallback(['InsufficientInstanceCapacity'], instancesIds);

const instancesResult = await createRunner(createRunnerConfig(defaultRunnerConfig));
expect(instancesResult).toEqual(instancesIds);

expect(mockEC2Client).toHaveReceivedCommandTimes(CreateFleetCommand, 2);

// first call with spot failure
expect(mockEC2Client).toHaveReceivedNthCommandWith(1, CreateFleetCommand, {
...expectedCreateFleetRequest({
...defaultExpectedFleetRequestValues,
totalTargetCapacity: 1,
capacityType: 'spot',
}),
});

// second call with on-demand fallback
expect(mockEC2Client).toHaveReceivedNthCommandWith(2, CreateFleetCommand, {
...expectedCreateFleetRequest({
...defaultExpectedFleetRequestValues,
totalTargetCapacity: 1,
capacityType: 'on-demand',
}),
});
});

it('test InsufficientInstanceCapacity no fallback.', async () => {
await expect(
createRunner(createRunnerConfig({ ...defaultRunnerConfig, onDemandFailoverOnError: [] })),
).rejects.toBeInstanceOf(Error);
});

it('test InsufficientInstanceCapacity with multiple instances and fallback to on demand.', async () => {
const instancesIds = ['i-123', 'i-456'];
createFleetMockWithWithOnDemandFallback(['InsufficientInstanceCapacity'], instancesIds);

const instancesResult = await createRunner({ ...createRunnerConfig(defaultRunnerConfig), numberOfRunners: 2 });
expect(instancesResult).toEqual(instancesIds);

expect(mockEC2Client).toHaveReceivedCommandTimes(CreateFleetCommand, 2);

// first call with spot failure
expect(mockEC2Client).toHaveReceivedNthCommandWith(1, CreateFleetCommand, {
...expectedCreateFleetRequest({
...defaultExpectedFleetRequestValues,
totalTargetCapacity: 2,
capacityType: 'spot',
}),
});

// second call with on-demand fallback, capacity is reduced by 1
expect(mockEC2Client).toHaveReceivedNthCommandWith(2, CreateFleetCommand, {
...expectedCreateFleetRequest({
...defaultExpectedFleetRequestValues,
totalTargetCapacity: 1,
capacityType: 'on-demand',
}),
});
});

it('test UnfulfillableCapacity with multiple instances and no fallback to on demand.', async () => {
const instancesIds = ['i-123', 'i-456'];
// failover is only configured for InsufficientInstanceCapacity, but UnfulfillableCapacity is returned, so no fallback happens
createFleetMockWithWithOnDemandFallback(['UnfulfillableCapacity'], instancesIds);

await expect(
createRunner({ ...createRunnerConfig(defaultRunnerConfig), numberOfRunners: 2 }),
).rejects.toBeInstanceOf(Error);

expect(mockEC2Client).toHaveReceivedCommandTimes(CreateFleetCommand, 1);

// first call with spot failure
expect(mockEC2Client).toHaveReceivedNthCommandWith(1, CreateFleetCommand, {
...expectedCreateFleetRequest({
...defaultExpectedFleetRequestValues,
totalTargetCapacity: 2,
capacityType: 'spot',
}),
});
});
});

function createFleetMockWithErrors(errors: string[], instances?: string[]) {
@@ -354,20 +500,37 @@ function createFleetMockWithErrors(errors: string[], instances?: string[]) {
mockEC2Client.on(CreateFleetCommand).resolves(result);
}

function createFleetMockWithWithOnDemandFallback(errors: string[], instances?: string[], numberOfFailures = 1) {
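// First CreateFleet call (spot): all but the last `numberOfFailures` instance IDs, returned together with the given error codes.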
const instancesFirstCall: CreateFleetInstance = {
InstanceIds: instances?.slice(0, instances.length - numberOfFailures).map((i) => i),
};

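// Second CreateFleet call (on-demand fallback): the remaining `numberOfFailures` instance IDs.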
const instancesSecondCall: CreateFleetInstance = {
InstanceIds: instances?.slice(instances.length - numberOfFailures, instances.length).map((i) => i),
};

mockEC2Client
.on(CreateFleetCommand)
.resolvesOnce({ Instances: [instancesFirstCall], Errors: errors.map((e) => ({ ErrorCode: e })) })
.resolvesOnce({ Instances: [instancesSecondCall] });
}

interface RunnerConfig {
type: RunnerType;
capacityType: DefaultTargetCapacityType;
allocationStrategy: SpotAllocationStrategy;
maxSpotPrice?: string;
amiIdSsmParameterName?: string;
tracingEnabled?: boolean;
onDemandFailoverOnError?: string[];
}

function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
return {
environment: ENVIRONMENT,
runnerType: runnerConfig.type,
runnerOwner: REPO_NAME,
numberOfRunners: 1,
launchTemplateName: LAUNCH_TEMPLATE,
ec2instanceCriteria: {
instanceTypes: ['m5.large', 'c5.large'],
Expand All @@ -378,6 +541,7 @@ function createRunnerConfig(runnerConfig: RunnerConfig): RunnerInputParameters {
subnets: ['subnet-123', 'subnet-456'],
amiIdSsmParameterName: runnerConfig.amiIdSsmParameterName,
tracingEnabled: runnerConfig.tracingEnabled,
onDemandFailoverOnError: runnerConfig.onDemandFailoverOnError,
};
}

@@ -451,7 +615,7 @@ function expectedCreateFleetRequest(expectedValues: ExpectedFleetRequestValues):
};

if (expectedValues.imageId) {
for (const config of request?.LaunchTemplateConfigs || []) {
for (const config of request?.LaunchTemplateConfigs ?? []) {
if (config.Overrides) {
for (const override of config.Overrides) {
override.ImageId = expectedValues.imageId;
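
// Note on the `||` → `??` change above: for an optional array the two operators behave
// the same, since an array is only skipped when it is null or undefined, but `??` makes
// the nullish check explicit instead of relying on general truthiness. Tiny illustration:
const configs: string[] | undefined = undefined;
const withOr = configs || []; // [] - falls back on any falsy value
const withNullish = configs ?? []; // [] - falls back only on null/undefined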