Skip to content

Commit

Permalink
fix: Honnor booting instance in runner pool (#2801)
Browse files Browse the repository at this point in the history
Since the pool was not respecting booting instances it could happen you end up with a too large pool. Applying this change could mean that you need a larger pool than before.

Co-authored-by: Michael Poutre <m1kep.my.mail@gmail.com>
Co-authored-by: Ryan McKern <ryan.mckern@segment.com>
  • Loading branch information
3 people authored Jan 12, 2023
1 parent fca2ddf commit 9f841f7
Show file tree
Hide file tree
Showing 5 changed files with 161 additions and 57 deletions.
156 changes: 112 additions & 44 deletions modules/runners/lambdas/runners/src/pool/pool.test.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import { Octokit } from '@octokit/rest';
import { mocked } from 'jest-mock';
import moment from 'moment-timezone';
import nock from 'nock';

import { listEC2Runners } from '../aws/runners';
import * as ghAuth from '../gh-auth/gh-auth';
import * as scale from '../scale-runners/scale-up';
import { createRunners } from '../scale-runners/scale-up';
import { adjust } from './pool';

const mockOctokit = {
Expand All @@ -24,6 +25,7 @@ jest.mock('@octokit/rest', () => ({

jest.mock('./../aws/runners');
jest.mock('./../gh-auth/gh-auth');
jest.mock('./../scale-runners/scale-up');

const mocktokit = Octokit as jest.MockedClass<typeof Octokit>;
const mockedAppAuth = mocked(ghAuth.createGithubAppAuth, {
Expand All @@ -36,6 +38,36 @@ const mockListRunners = mocked(listEC2Runners);
const cleanEnv = process.env;

const ORG = 'my-org';
const MINIMUM_TIME_RUNNING = 15;

const ec2InstancesRegistered = [
{
instanceId: 'i-1-idle',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-2-busy',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-3-offline',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-4-idle-older-than-minimum-time-running',
launchTime: moment(new Date())
.subtract(MINIMUM_TIME_RUNNING + 3, 'minutes')
.toDate(),
type: 'Org',
owner: ORG,
},
];

beforeEach(() => {
nock.disableNetConnect();
Expand All @@ -55,6 +87,7 @@ beforeEach(() => {
process.env.INSTANCE_TYPES = 'm5.large';
process.env.INSTANCE_TARGET_CAPACITY_TYPE = 'spot';
process.env.RUNNER_OWNER = ORG;
process.env.RUNNER_BOOT_TIME_IN_MINUTES = MINIMUM_TIME_RUNNING.toString();

const mockTokenReturnValue = {
data: {
Expand All @@ -66,66 +99,39 @@ beforeEach(() => {
mockOctokit.paginate.mockImplementation(() => [
{
id: 1,
name: 'i-1',
name: 'i-1-idle',
os: 'linux',
status: 'online',
busy: false,
labels: [],
},
{
id: 2,
name: 'i-2',
name: 'i-2-busy',
os: 'linux',
status: 'online',
busy: true,
labels: [],
},
{
id: 3,
name: 'i-3',
name: 'i-3-offline',
os: 'linux',
status: 'offline',
busy: false,
labels: [],
},
{
id: 11,
name: 'j-1', // some runner of another env
id: 3,
name: 'i-4-idle-older-than-minimum-time-running',
os: 'linux',
status: 'online',
busy: false,
labels: [],
},
{
id: 12,
name: 'j-2', // some runner of another env
os: 'linux',
status: 'online',
busy: true,
labels: [],
},
]);

mockListRunners.mockImplementation(async () => [
{
instanceId: 'i-1',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-2',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-3',
launchTime: new Date(),
type: 'Org',
owner: ORG,
},
]);
mockListRunners.mockImplementation(async () => ec2InstancesRegistered);

const mockInstallationIdReturnValueOrgs = {
data: {
Expand Down Expand Up @@ -156,16 +162,74 @@ beforeEach(() => {

describe('Test simple pool.', () => {
describe('With GitHub Cloud', () => {
it('Top up pool with pool size 2.', async () => {
const spy = jest.spyOn(scale, 'createRunners');
await expect(adjust({ poolSize: 2 })).resolves;
expect(spy).toBeCalled;
it('Top up pool with pool size 2 registered.', async () => {
await expect(await adjust({ poolSize: 3 })).resolves;
expect(createRunners).toHaveBeenCalledTimes(1);
expect(createRunners).toHaveBeenCalledWith(
expect.anything(),
expect.objectContaining({ numberOfRunners: 1 }),
expect.anything(),
);
});

it('Should not top up if pool size is reached.', async () => {
const spy = jest.spyOn(scale, 'createRunners');
await expect(adjust({ poolSize: 1 })).resolves;
expect(spy).not.toHaveBeenCalled;
await expect(await adjust({ poolSize: 1 })).resolves;
expect(createRunners).not.toHaveBeenCalled();
});

it('Should top up if pool size is not reached including a booting instance.', async () => {
mockListRunners.mockImplementation(async () => [
...ec2InstancesRegistered,
{
instanceId: 'i-4-still-booting',
launchTime: moment(new Date())
.subtract(MINIMUM_TIME_RUNNING - 3, 'minutes')
.toDate(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-5-orphan',
launchTime: moment(new Date())
.subtract(MINIMUM_TIME_RUNNING + 3, 'minutes')
.toDate(),
type: 'Org',
owner: ORG,
},
]);

// 2 idle + 1 booting = 3, top up with 2 to match a pool of 5
await expect(await adjust({ poolSize: 5 })).resolves;
expect(createRunners).toHaveBeenCalledWith(
expect.anything(),
expect.objectContaining({ numberOfRunners: 2 }),
expect.anything(),
);
});

it('Should not top up if pool size is reached including a booting instance.', async () => {
mockListRunners.mockImplementation(async () => [
...ec2InstancesRegistered,
{
instanceId: 'i-4-still-booting',
launchTime: moment(new Date())
.subtract(MINIMUM_TIME_RUNNING - 3, 'minutes')
.toDate(),
type: 'Org',
owner: ORG,
},
{
instanceId: 'i-5-orphan',
launchTime: moment(new Date())
.subtract(MINIMUM_TIME_RUNNING + 3, 'minutes')
.toDate(),
type: 'Org',
owner: ORG,
},
]);

await expect(await adjust({ poolSize: 2 })).resolves;
expect(createRunners).not.toHaveBeenCalled();
});
});

Expand All @@ -175,9 +239,13 @@ describe('Test simple pool.', () => {
});

it('Top up if the pool size is set to 5', async () => {
const spy = jest.spyOn(scale, 'createRunners');
await expect(adjust({ poolSize: 5 })).resolves;
expect(spy).toBeCalled;
await expect(await adjust({ poolSize: 5 })).resolves;
// 2 idle, top up with 3 to match a pool of 5
expect(createRunners).toHaveBeenCalledWith(
expect.anything(),
expect.objectContaining({ numberOfRunners: 3 }),
expect.anything(),
);
});
});
});
59 changes: 46 additions & 13 deletions modules/runners/lambdas/runners/src/pool/pool.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import moment from 'moment';
import yn from 'yn';

import { listEC2Runners } from '../aws/runners';
import { RunnerList, listEC2Runners } from '../aws/runners';
import { createGithubAppAuth, createGithubInstallationAuth, createOctoClient } from '../gh-auth/gh-auth';
import { logger as rootLogger } from '../logger';
import { createRunners } from '../scale-runners/scale-up';
Expand All @@ -11,6 +12,11 @@ export interface PoolEvent {
poolSize: number;
}

interface RunnerStatus {
busy: boolean;
status: string;
}

export async function adjust(event: PoolEvent): Promise<void> {
logger.info(`Checking current pool size against pool of size: ${event.poolSize}`);
const runnerExtraLabels = process.env.RUNNER_EXTRA_LABELS;
Expand Down Expand Up @@ -45,20 +51,41 @@ export async function adjust(event: PoolEvent): Promise<void> {
per_page: 100,
},
);
const idleRunners = runners.filter((r) => !r.busy && r.status === 'online').map((r) => r.name);
const runnerStatus = new Map<string, RunnerStatus>();
for (const runner of runners) {
runnerStatus.set(runner.name, { busy: runner.busy, status: runner.status });
}

// Look up the managed ec2 runners in AWS, but running does not mean idle
const ec2runners = (
await listEC2Runners({
environment,
runnerOwner,
runnerType: 'Org',
statuses: ['running'],
})
).map((r) => r.instanceId);
const ec2runners = await listEC2Runners({
environment,
runnerOwner,
runnerType: 'Org',
statuses: ['running'],
});

const managedIdleRunners = ec2runners.filter((r) => idleRunners.includes(r));
const topUp = event.poolSize - managedIdleRunners.length;
// Runner should be considered idle if it is still booting, or is idle in GitHub
let numberOfRunnersInPool = 0;
for (const ec2Instance of ec2runners) {
if (
runnerStatus.get(ec2Instance.instanceId)?.busy === false &&
runnerStatus.get(ec2Instance.instanceId)?.status === 'online'
) {
numberOfRunnersInPool++;
logger.debug(`Runner ${ec2Instance.instanceId} is idle in GitHub and counted as part of the pool`);
} else if (runnerStatus.get(ec2Instance.instanceId) != null) {
logger.debug(`Runner ${ec2Instance.instanceId} is not idle in GitHub and NOT counted as part of the pool`);
} else if (!bootTimeExceeded(ec2Instance)) {
numberOfRunnersInPool++;
logger.info(`Runner ${ec2Instance.instanceId} is still booting and counted as part of the pool`);
} else {
logger.debug(
`Runner ${ec2Instance.instanceId} is not idle in GitHub nor booting and not counted as part of the pool`,
);
}
}

const topUp = event.poolSize - numberOfRunnersInPool;
if (topUp > 0) {
logger.info(`The pool will be topped up with ${topUp} runners.`);
await createRunners(
Expand Down Expand Up @@ -87,7 +114,7 @@ export async function adjust(event: PoolEvent): Promise<void> {
githubInstallationClient,
);
} else {
logger.info(`Pool will not be topped up. Find ${managedIdleRunners} managed idle runners.`);
logger.info(`Pool will not be topped up. Found ${numberOfRunnersInPool} managed idle runners.`);
}
}

Expand All @@ -101,3 +128,9 @@ async function getInstallationId(ghesApiUrl: string, org: string): Promise<numbe
})
).data.id;
}

function bootTimeExceeded(ec2Runner: RunnerList): boolean {
const runnerBootTimeInMinutes = process.env.RUNNER_BOOT_TIME_IN_MINUTES;
const launchTimePlusBootTime = moment(ec2Runner.launchTime).utc().add(runnerBootTimeInMinutes, 'minutes');
return launchTimePlusBootTime < moment(new Date()).utc();
}
1 change: 1 addition & 0 deletions modules/runners/pool.tf
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ module "pool" {
runner = {
disable_runner_autoupdate = var.disable_runner_autoupdate
ephemeral = var.enable_ephemeral_runners
boot_time_in_minutes = var.runner_boot_time_in_minutes
extra_labels = var.runner_extra_labels
launch_template = aws_launch_template.runner
group_name = var.runner_group_name
Expand Down
1 change: 1 addition & 0 deletions modules/runners/pool/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ resource "aws_lambda_function" "pool" {
NODE_TLS_REJECT_UNAUTHORIZED = var.config.ghes.url != null && !var.config.ghes.ssl_verify ? 0 : 1
PARAMETER_GITHUB_APP_ID_NAME = var.config.github_app_parameters.id.name
PARAMETER_GITHUB_APP_KEY_BASE64_NAME = var.config.github_app_parameters.key_base64.name
RUNNER_BOOT_TIME_IN_MINUTES = var.config.runner.boot_time_in_minutes
RUNNER_EXTRA_LABELS = var.config.runner.extra_labels
RUNNER_GROUP_NAME = var.config.runner.group_name
RUNNER_OWNER = var.config.runner.pool_owner
Expand Down
1 change: 1 addition & 0 deletions modules/runners/pool/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ variable "config" {
runner = object({
disable_runner_autoupdate = bool
ephemeral = bool
boot_time_in_minutes = number
extra_labels = string
launch_template = object({
name = string
Expand Down

0 comments on commit 9f841f7

Please sign in to comment.