Skip to content

Commit

Permalink
[eventLog] retry resource creation at initialization time (#136363) (#…
Browse files Browse the repository at this point in the history
…136647)

resolves #134098

Adds retry logic to the initialization of elasticsearch
resources, when Kibana starts up.  Recently, it seems
this has become a more noticeable error - that race
conditions occur where two Kibana's initializing a new
stack version will race to create the event log resources.

We believe we'll see the end of these issues with some
retries, chunked around the 4 resource-y sections of
the initialization code.

We're using [p-retry][] (which uses [retry][]), to do an
exponential backoff starting at 2s, then 4s, 8s, 16s,
with 4 retries (so 5 actual attempted calls).  Some
randomness is added, since there's a race on.

[p-retry]: https://github.com/sindresorhus/p-retry#p-retry
[retry]: https://github.com/tim-kos/node-retry#retry

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
(cherry picked from commit f6e4c2f)

Co-authored-by: Patrick Mueller <patrick.mueller@elastic.co>
  • Loading branch information
kibanamachine and pmuellr authored Jul 19, 2022
1 parent 6e0619f commit 5f681db
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 9 deletions.
3 changes: 3 additions & 0 deletions x-pack/plugins/event_log/server/es/context.mock.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import { namesMock } from './names.mock';
import { IClusterClientAdapter } from './cluster_client_adapter';
import { clusterClientAdapterMock } from './cluster_client_adapter.mock';

export const MOCK_RETRY_DELAY = 20;

const createContextMock = () => {
const mock: jest.Mocked<EsContext> & {
esAdapter: jest.Mocked<IClusterClientAdapter>;
Expand All @@ -23,6 +25,7 @@ const createContextMock = () => {
waitTillReady: jest.fn(async () => true),
esAdapter: clusterClientAdapterMock.create(),
initialized: true,
retryDelay: MOCK_RETRY_DELAY,
};
return mock;
};
Expand Down
13 changes: 9 additions & 4 deletions x-pack/plugins/event_log/server/es/context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,17 @@ import { initializeEs } from './init';
import { ClusterClientAdapter, IClusterClientAdapter } from './cluster_client_adapter';
import { createReadySignal, ReadySignal } from '../lib/ready_signal';

export const RETRY_DELAY = 2000;

export interface EsContext {
logger: Logger;
esNames: EsNames;
esAdapter: IClusterClientAdapter;
readonly logger: Logger;
readonly esNames: EsNames;
readonly esAdapter: IClusterClientAdapter;
initialize(): void;
shutdown(): Promise<void>;
waitTillReady(): Promise<boolean>;
initialized: boolean;
readonly initialized: boolean;
readonly retryDelay: number;
}

export interface EsError {
Expand All @@ -44,12 +47,14 @@ class EsContextImpl implements EsContext {
public esAdapter: IClusterClientAdapter;
private readonly readySignal: ReadySignal<boolean>;
public initialized: boolean;
public readonly retryDelay: number;

constructor(params: EsContextCtorParams) {
this.logger = params.logger;
this.esNames = getEsNames(params.indexNameRoot, params.kibanaVersion);
this.readySignal = createReadySignal();
this.initialized = false;
this.retryDelay = RETRY_DELAY;
this.esAdapter = new ClusterClientAdapter({
logger: params.logger,
elasticsearchClientPromise: params.elasticsearchClientPromise,
Expand Down
90 changes: 89 additions & 1 deletion x-pack/plugins/event_log/server/es/init.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* 2.0.
*/

import { contextMock } from './context.mock';
import { contextMock, MOCK_RETRY_DELAY } from './context.mock';
import { initializeEs, parseIndexAliases } from './init';

describe('initializeEs', () => {
Expand Down Expand Up @@ -454,3 +454,91 @@ describe('parseIndexAliases', () => {
]);
});
});

describe('retries', () => {
let esContext = contextMock.create();
// set up context APIs to return defaults indicating already created
beforeEach(() => {
esContext = contextMock.create();
esContext.esAdapter.getExistingLegacyIndexTemplates.mockResolvedValue({});
esContext.esAdapter.getExistingIndices.mockResolvedValue({});
esContext.esAdapter.getExistingIndexAliases.mockResolvedValue({});
esContext.esAdapter.doesIlmPolicyExist.mockResolvedValue(true);
esContext.esAdapter.doesIndexTemplateExist.mockResolvedValue(true);
esContext.esAdapter.doesAliasExist.mockResolvedValue(true);
});

test('createIlmPolicyIfNotExists with 1 retry', async () => {
esContext.esAdapter.doesIlmPolicyExist.mockRejectedValueOnce(new Error('retry 1'));

const timeStart = Date.now();
await initializeEs(esContext);
const timeElapsed = Date.now() - timeStart;

expect(timeElapsed).toBeGreaterThanOrEqual(MOCK_RETRY_DELAY);

expect(esContext.esAdapter.getExistingLegacyIndexTemplates).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesIlmPolicyExist).toHaveBeenCalledTimes(2);
expect(esContext.esAdapter.doesIndexTemplateExist).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesAliasExist).toHaveBeenCalledTimes(1);

const prefix = `eventLog initialization operation failed and will be retried: createIlmPolicyIfNotExists`;
expect(esContext.logger.warn).toHaveBeenCalledTimes(1);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 4 more times; error: retry 1`);
});

test('createIndexTemplateIfNotExists with 2 retries', async () => {
esContext.esAdapter.doesIndexTemplateExist.mockRejectedValueOnce(new Error('retry 2a'));
esContext.esAdapter.doesIndexTemplateExist.mockRejectedValueOnce(new Error('retry 2b'));

const timeStart = Date.now();
await initializeEs(esContext);
const timeElapsed = Date.now() - timeStart;

expect(timeElapsed).toBeGreaterThanOrEqual(MOCK_RETRY_DELAY * (1 + 2));

expect(esContext.esAdapter.getExistingLegacyIndexTemplates).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesIlmPolicyExist).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesIndexTemplateExist).toHaveBeenCalledTimes(3);
expect(esContext.esAdapter.doesAliasExist).toHaveBeenCalledTimes(1);

const prefix = `eventLog initialization operation failed and will be retried: createIndexTemplateIfNotExists`;
expect(esContext.logger.warn).toHaveBeenCalledTimes(2);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 4 more times; error: retry 2a`);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 3 more times; error: retry 2b`);
});

test('createInitialIndexIfNotExists', async () => {
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5a'));
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5b'));
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5c'));
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5d'));
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5e'));
// make sure it only tries 5 times - this one should not be reported
esContext.esAdapter.doesAliasExist.mockRejectedValueOnce(new Error('retry 5f'));

const timeStart = Date.now();
await initializeEs(esContext);
const timeElapsed = Date.now() - timeStart;

expect(timeElapsed).toBeGreaterThanOrEqual(MOCK_RETRY_DELAY * (1 + 2 + 4 + 8));

expect(esContext.esAdapter.getExistingLegacyIndexTemplates).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesIlmPolicyExist).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesIndexTemplateExist).toHaveBeenCalledTimes(1);
expect(esContext.esAdapter.doesAliasExist).toHaveBeenCalledTimes(5);

const prefix = `eventLog initialization operation failed and will be retried: createInitialIndexIfNotExists`;
expect(esContext.logger.warn).toHaveBeenCalledTimes(5);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 4 more times; error: retry 5a`);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 3 more times; error: retry 5b`);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 2 more times; error: retry 5c`);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 1 more times; error: retry 5d`);
expect(esContext.logger.warn).toHaveBeenCalledWith(`${prefix}; 0 more times; error: retry 5e`);

expect(esContext.logger.error).toHaveBeenCalledTimes(1);
expect(esContext.logger.error).toHaveBeenCalledWith(
`error initializing elasticsearch resources: retry 5e`
);
});
});
34 changes: 30 additions & 4 deletions x-pack/plugins/event_log/server/es/init.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@
import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey';
import { asyncForEach } from '@kbn/std';
import { groupBy } from 'lodash';
import pRetry, { FailedAttemptError } from 'p-retry';
import { getIlmPolicy, getIndexTemplate } from './documents';
import { EsContext } from './context';

const MAX_RETRY_DELAY = 30000;

export async function initializeEs(esContext: EsContext): Promise<boolean> {
esContext.logger.debug('initializing elasticsearch resources starting');

Expand All @@ -28,10 +31,33 @@ export async function initializeEs(esContext: EsContext): Promise<boolean> {
async function initializeEsResources(esContext: EsContext) {
const steps = new EsInitializationSteps(esContext);

await steps.setExistingAssetsToHidden();
await steps.createIlmPolicyIfNotExists();
await steps.createIndexTemplateIfNotExists();
await steps.createInitialIndexIfNotExists();
// today, setExistingAssetsToHidden() never throws, but just in case ...
await retry(steps.setExistingAssetsToHidden);
await retry(steps.createIlmPolicyIfNotExists);
await retry(steps.createIndexTemplateIfNotExists);
await retry(steps.createInitialIndexIfNotExists);

async function retry(stepMethod: () => Promise<void>): Promise<void> {
// call the step method with retry options via p-retry
await pRetry(() => stepMethod.call(steps), getRetryOptions(esContext, stepMethod.name));
}
}

function getRetryOptions(esContext: EsContext, operation: string) {
const logger = esContext.logger;
// should retry on the order of 2s, 4s, 8s, 16s
// see: https://github.com/tim-kos/node-retry#retryoperationoptions
return {
minTimeout: esContext.retryDelay,
maxTimeout: MAX_RETRY_DELAY,
retries: 4,
factor: 2,
randomize: true,
onFailedAttempt: (err: FailedAttemptError) => {
const message = `eventLog initialization operation failed and will be retried: ${operation}; ${err.retriesLeft} more times; error: ${err.message}`;
logger.warn(message);
},
};
}

export interface ParsedIndexAlias extends estypes.IndicesAliasDefinition {
Expand Down

0 comments on commit 5f681db

Please sign in to comment.