Skip to content

Commit

Permalink
[Stack Monitoring] create alert per node instead of per cluster (#102544) (#103719)
Browse files Browse the repository at this point in the history

* create alert per node instead of per cluster

* add comment

* fix test, replace alert state with empty array when no node is firing

* update cpu usage action messaging

* fix internationalization

* update disk usage rule action messaging

* update memory usage rule action messaging

* update other action messaging

* update missing monitoring data alert action messaging

* remove comment

* fix bug where threadpool alerts were not firing

* fix bug with threadpool rejections and update alert action messaging to be per node

* update comments

* unit test for thread pool write rejections alert

* update messaging for CCR read rejection

* fix cluster level alerts to use the cluster id when it's not node level

* add more tests to nodes changed alert

* update default message

* update alert messaging for large shard size

* update default messaging

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>

Co-authored-by: Kibana Machine <42973632+kibanamachine@users.noreply.github.com>
  • Loading branch information
neptunian and kibanamachine authored Jun 29, 2021
1 parent cccc869 commit 81f6f31
Show file tree
Hide file tree
Showing 24 changed files with 1,371 additions and 402 deletions.
5 changes: 2 additions & 3 deletions x-pack/plugins/monitoring/common/types/alerts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,9 @@ export interface AlertMemoryUsageState extends AlertNodeState {
memoryUsage: number;
}

export interface AlertThreadPoolRejectionsState extends AlertState {
export interface AlertThreadPoolRejectionsState extends AlertNodeState {
rejectionCount: number;
type: string;
nodeId: string;
nodeName?: string;
}

export interface AlertLicenseState extends AlertState {
Expand Down Expand Up @@ -172,6 +170,7 @@ export interface AlertThreadPoolRejectionsStats {
nodeId: string;
nodeName: string;
rejectionCount: number;
type: string;
ccs?: string;
}

Expand Down
2 changes: 1 addition & 1 deletion x-pack/plugins/monitoring/server/alerts/alert_helpers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ export class AlertingDefaults {
clusterName: {
name: 'clusterName',
description: i18n.translate('xpack.monitoring.alerts.actionVariables.clusterName', {
defaultMessage: 'The cluster to which the nodes belong.',
defaultMessage: 'The cluster to which the node(s) belongs.',
}),
},
action: {
Expand Down
63 changes: 33 additions & 30 deletions x-pack/plugins/monitoring/server/alerts/base_alert.ts
Original file line number Diff line number Diff line change
Expand Up @@ -274,46 +274,49 @@ export class BaseAlert {
state: ExecutedState
) {
const currentUTC = +new Date();
// for each cluster filter the nodes that belong to this cluster
for (const cluster of clusters) {
const nodes = data.filter((node) => node.clusterUuid === cluster.clusterUuid);
if (!nodes.length) {
continue;
}

const firingNodeUuids = nodes
.filter((node) => node.shouldFire)
.map((node) => node.meta.nodeId || node.meta.instanceId)
.join(',');
const instanceId = `${this.alertOptions.id}:${cluster.clusterUuid}:${firingNodeUuids}`;
const instance = services.alertInstanceFactory(instanceId);
const newAlertStates: AlertNodeState[] = [];
const key = this.alertOptions.accessorKey;

// for each node, update the alert's state with node state
for (const node of nodes) {
if (!node.shouldFire) {
continue;
const newAlertStates: AlertNodeState[] = [];
// quick fix for now so that non node level alerts will use the cluster id
const instance = services.alertInstanceFactory(
node.meta.nodeId || node.meta.instanceId || cluster.clusterUuid
);

if (node.shouldFire) {
const { meta } = node;
// create a default alert state for this node and add data from node.meta and other data
const nodeState = this.getDefaultAlertState(cluster, node) as AlertNodeState;
if (key) {
nodeState[key] = meta[key];
}
nodeState.nodeId = meta.nodeId || node.nodeId! || meta.instanceId;
// TODO: make these functions more generic, so it's node/item agnostic
nodeState.nodeName = meta.itemLabel || meta.nodeName || node.nodeName || nodeState.nodeId;
nodeState.itemLabel = meta.itemLabel;
nodeState.meta = meta;
nodeState.ui.triggeredMS = currentUTC;
nodeState.ui.isFiring = true;
nodeState.ui.severity = node.severity;
nodeState.ui.message = this.getUiMessage(nodeState, node);
// store the state of each node in array.
newAlertStates.push(nodeState);
}
const { meta } = node;
const nodeState = this.getDefaultAlertState(cluster, node) as AlertNodeState;
if (key) {
nodeState[key] = meta[key];
const alertInstanceState = { alertStates: newAlertStates };
// update the alert's state with the new node states
instance.replaceState(alertInstanceState);
if (newAlertStates.length) {
this.executeActions(instance, alertInstanceState, null, cluster);
state.lastExecutedAction = currentUTC;
}
nodeState.nodeId = meta.nodeId || node.nodeId! || meta.instanceId;
// TODO: make these functions more generic, so it's node/item agnostic
nodeState.nodeName = meta.itemLabel || meta.nodeName || node.nodeName || nodeState.nodeId;
nodeState.itemLabel = meta.itemLabel;
nodeState.meta = meta;
nodeState.ui.triggeredMS = currentUTC;
nodeState.ui.isFiring = true;
nodeState.ui.severity = node.severity;
nodeState.ui.message = this.getUiMessage(nodeState, node);
newAlertStates.push(nodeState);
}

const alertInstanceState = { alertStates: newAlertStates };
instance.replaceState(alertInstanceState);
if (newAlertStates.length) {
this.executeActions(instance, alertInstanceState, null, cluster);
state.lastExecutedAction = currentUTC;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ export class CCRReadExceptionsAlert extends BaseAlert {
'xpack.monitoring.alerts.ccrReadExceptions.shortAction',
{
defaultMessage:
'Verify follower and leader index relationships across the affected remote clusters.',
'Verify follower and leader index relationships on the affected remote cluster.',
}
);
const fullActionText = i18n.translate('xpack.monitoring.alerts.ccrReadExceptions.fullAction', {
Expand All @@ -258,7 +258,7 @@ export class CCRReadExceptionsAlert extends BaseAlert {
const internalShortMessage = i18n.translate(
'xpack.monitoring.alerts.ccrReadExceptions.firing.internalShortMessage',
{
defaultMessage: `CCR read exceptions alert is firing for the following remote clusters: {remoteClustersList}. {shortActionText}`,
defaultMessage: `CCR read exceptions alert is firing for the following remote cluster: {remoteClustersList}. {shortActionText}`,
values: {
remoteClustersList,
shortActionText,
Expand All @@ -268,7 +268,7 @@ export class CCRReadExceptionsAlert extends BaseAlert {
const internalFullMessage = i18n.translate(
'xpack.monitoring.alerts.ccrReadExceptions.firing.internalFullMessage',
{
defaultMessage: `CCR read exceptions alert is firing for the following remote clusters: {remoteClustersList}. Current 'follower_index' indices are affected: {followerIndicesList}. {action}`,
defaultMessage: `CCR read exceptions alert is firing for the following remote cluster: {remoteClustersList}. Current 'follower_index' index affected: {followerIndicesList}. {action}`,
values: {
action,
remoteClustersList,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ describe('ClusterHealthAlert', () => {
description: 'The full internal message generated by Elastic.',
},
{ name: 'state', description: 'The current state of the alert.' },
{ name: 'clusterName', description: 'The cluster to which the nodes belong.' },
{ name: 'clusterName', description: 'The cluster to which the node(s) belongs.' },
{ name: 'action', description: 'The recommended action for this alert.' },
{
name: 'actionPlain',
Expand Down
28 changes: 14 additions & 14 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_alert.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ jest.mock('../lib/alerts/fetch_cpu_usage_node_stats', () => ({
jest.mock('../lib/alerts/fetch_clusters', () => ({
fetchClusters: jest.fn(),
}));

jest.mock('../static_globals', () => ({
Globals: {
app: {
getLogger: () => ({ debug: jest.fn() }),
url: 'http://localhost:5601',
config: {
ui: {
ccs: { enabled: true },
Expand All @@ -43,8 +43,7 @@ describe('CpuUsageAlert', () => {
expect(alert.alertOptions.throttle).toBe('1d');
expect(alert.alertOptions.defaultParams).toStrictEqual({ threshold: 85, duration: '5m' });
expect(alert.alertOptions.actionVariables).toStrictEqual([
{ name: 'nodes', description: 'The list of nodes reporting high cpu usage.' },
{ name: 'count', description: 'The number of nodes reporting high cpu usage.' },
{ name: 'node', description: 'The node reporting high cpu usage.' },
{
name: 'internalShortMessage',
description: 'The short internal message generated by Elastic.',
Expand All @@ -54,7 +53,7 @@ describe('CpuUsageAlert', () => {
description: 'The full internal message generated by Elastic.',
},
{ name: 'state', description: 'The current state of the alert.' },
{ name: 'clusterName', description: 'The cluster to which the nodes belong.' },
{ name: 'clusterName', description: 'The cluster to which the node(s) belongs.' },
{ name: 'action', description: 'The recommended action for this alert.' },
{
name: 'actionPlain',
Expand Down Expand Up @@ -140,8 +139,7 @@ describe('CpuUsageAlert', () => {
ui: {
isFiring: true,
message: {
text:
'Node #start_linkmyNodeName#end_link is reporting cpu usage of 91% at #absolute',
text: `Node #start_link${nodeName}#end_link is reporting cpu usage of ${cpuUsage}% at #absolute`,
nextSteps: [
{
text: '#start_linkCheck hot threads#end_link',
Expand Down Expand Up @@ -192,13 +190,14 @@ describe('CpuUsageAlert', () => {
],
});
expect(scheduleActions).toHaveBeenCalledWith('default', {
internalFullMessage: `CPU usage alert is firing for ${count} node(s) in cluster: ${clusterName}. [View nodes](elasticsearch/nodes)`,
internalShortMessage: `CPU usage alert is firing for ${count} node(s) in cluster: ${clusterName}. Verify CPU levels across affected nodes.`,
action: `[View nodes](elasticsearch/nodes)`,
actionPlain: 'Verify CPU levels across affected nodes.',
internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. Verify CPU level of node.`,
action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
actionPlain: 'Verify CPU level of node.',
clusterName,
count,
nodes: `${nodeName}:${cpuUsage}`,
node: `${nodeName}:${cpuUsage}`,
state: 'firing',
});
});
Expand Down Expand Up @@ -242,13 +241,14 @@ describe('CpuUsageAlert', () => {
} as any);
const count = 1;
expect(scheduleActions).toHaveBeenCalledWith('default', {
internalFullMessage: `CPU usage alert is firing for ${count} node(s) in cluster: ${clusterName}. [View nodes](elasticsearch/nodes)`,
internalShortMessage: `CPU usage alert is firing for ${count} node(s) in cluster: ${clusterName}. Verify CPU levels across affected nodes.`,
action: `[View nodes](elasticsearch/nodes)`,
actionPlain: 'Verify CPU levels across affected nodes.',
internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:${ccs}))`,
internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. Verify CPU level of node.`,
action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:testCluster))`,
actionPlain: 'Verify CPU level of node.',
clusterName,
count,
nodes: `${nodeName}:${cpuUsage}`,
node: `${nodeName}:${cpuUsage}`,
state: 'firing',
});
});
Expand Down
109 changes: 55 additions & 54 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_alert.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,9 @@ export class CpuUsageAlert extends BaseAlert {
},
actionVariables: [
{
name: 'nodes',
description: i18n.translate('xpack.monitoring.alerts.cpuUsage.actionVariables.nodes', {
defaultMessage: 'The list of nodes reporting high cpu usage.',
}),
},
{
name: 'count',
description: i18n.translate('xpack.monitoring.alerts.cpuUsage.actionVariables.count', {
defaultMessage: 'The number of nodes reporting high cpu usage.',
name: 'node',
description: i18n.translate('xpack.monitoring.alerts.cpuUsage.actionVariables.node', {
defaultMessage: 'The node reporting high cpu usage.',
}),
},
...Object.values(AlertingDefaults.ALERT_TYPE.context),
Expand Down Expand Up @@ -170,51 +164,58 @@ export class CpuUsageAlert extends BaseAlert {
if (alertStates.length === 0) {
return;
}

const firingNodes = alertStates.filter(
(alertState) => alertState.ui.isFiring
) as AlertCpuUsageState[];
const firingCount = firingNodes.length;
if (firingCount > 0) {
const shortActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.shortAction', {
defaultMessage: 'Verify CPU levels across affected nodes.',
});
const fullActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.fullAction', {
defaultMessage: 'View nodes',
});
const action = `[${fullActionText}](elasticsearch/nodes)`;
const internalShortMessage = i18n.translate(
'xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage',
{
defaultMessage: `CPU usage alert is firing for {count} node(s) in cluster: {clusterName}. {shortActionText}`,
values: {
count: firingCount,
clusterName: cluster.clusterName,
shortActionText,
},
}
);
const internalFullMessage = i18n.translate(
'xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage',
{
defaultMessage: `CPU usage alert is firing for {count} node(s) in cluster: {clusterName}. {action}`,
values: {
count: firingCount,
clusterName: cluster.clusterName,
action,
},
}
);
instance.scheduleActions('default', {
internalShortMessage,
internalFullMessage: Globals.app.isCloud ? internalShortMessage : internalFullMessage,
state: AlertingDefaults.ALERT_STATE.firing,
nodes: firingNodes.map(({ nodeName, cpuUsage }) => `${nodeName}:${cpuUsage}`).toString(),
count: firingCount,
clusterName: cluster.clusterName,
action,
actionPlain: shortActionText,
});
const firingNode = alertStates[0] as AlertCpuUsageState;
if (!firingNode || !firingNode.ui.isFiring) {
return;
}
const shortActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.shortAction', {
defaultMessage: 'Verify CPU level of node.',
});
const fullActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.fullAction', {
defaultMessage: 'View node',
});
const ccs = firingNode.ccs;
const globalStateLink = this.createGlobalStateLink(
`elasticsearch/nodes/${firingNode.nodeId}`,
cluster.clusterUuid,
ccs
);
const action = `[${fullActionText}](${globalStateLink})`;
const internalShortMessage = i18n.translate(
'xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage',
{
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {shortActionText}`,
values: {
clusterName: cluster.clusterName,
nodeName: firingNode.nodeName,
shortActionText,
},
}
);
const internalFullMessage = i18n.translate(
'xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage',
{
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {action}`,
values: {
clusterName: cluster.clusterName,
nodeName: firingNode.nodeName,
action,
},
}
);
instance.scheduleActions('default', {
internalShortMessage,
internalFullMessage: Globals.app.isCloud ? internalShortMessage : internalFullMessage,
state: AlertingDefaults.ALERT_STATE.firing,
/* continue to send "nodes" and "count" values for users before https://github.com/elastic/kibana/pull/102544
see https://github.com/elastic/kibana/issues/100136#issuecomment-865229431
*/
nodes: `${firingNode.nodeName}:${firingNode.cpuUsage}`,
count: 1,
node: `${firingNode.nodeName}:${firingNode.cpuUsage}`,
clusterName: cluster.clusterName,
action,
actionPlain: shortActionText,
});
}
}
Loading

0 comments on commit 81f6f31

Please sign in to comment.