From d784840ac180911b742f2b10a1f16038b3dc2dd5 Mon Sep 17 00:00:00 2001 From: Chris Roberson Date: Mon, 26 Oct 2020 15:47:56 -0400 Subject: [PATCH] [Monitoring] Fix a couple of issues with the cpu usage alert (#80737) * Fix a couple of issues with the cpu usage alert * Fix tests * PR feedback --- .../components/elasticsearch/node/advanced.js | 21 ++++++- .../elasticsearch/node/advanced/index.js | 1 + .../server/alerts/cpu_usage_alert.ts | 7 +-- .../alerts/fetch_cpu_usage_node_stats.test.ts | 56 ++++++++++++------ .../lib/alerts/fetch_cpu_usage_node_stats.ts | 57 ++++++++++++++----- 5 files changed, 104 insertions(+), 38 deletions(-) diff --git a/x-pack/plugins/monitoring/public/components/elasticsearch/node/advanced.js b/x-pack/plugins/monitoring/public/components/elasticsearch/node/advanced.js index b2a17515bbb96..67df745e619d4 100644 --- a/x-pack/plugins/monitoring/public/components/elasticsearch/node/advanced.js +++ b/x-pack/plugins/monitoring/public/components/elasticsearch/node/advanced.js @@ -18,8 +18,9 @@ import { import { NodeDetailStatus } from '../node_detail_status'; import { MonitoringTimeseriesContainer } from '../../chart'; import { FormattedMessage } from '@kbn/i18n/react'; +import { AlertsCallout } from '../../../alerts/callout'; -export const AdvancedNode = ({ nodeSummary, metrics, alerts, ...props }) => { +export const AdvancedNode = ({ nodeSummary, metrics, alerts, nodeId, ...props }) => { const metricsToShow = [ metrics.node_gc, metrics.node_gc_time, @@ -50,9 +51,25 @@ export const AdvancedNode = ({ nodeSummary, metrics, alerts, ...props }) => { - + + state.nodeId === nodeId || state.stackProductUuid === nodeId + } + /> + state.nodeId === nodeId || state.stackProductUuid === nodeId} + nextStepsFilter={(nextStep) => { + if (nextStep.text.includes('Elasticsearch nodes')) { + return false; + } + return true; + }} + /> {metricsToShow.map((metric, index) => ( diff --git a/x-pack/plugins/monitoring/public/views/elasticsearch/node/advanced/index.js b/x-pack/plugins/monitoring/public/views/elasticsearch/node/advanced/index.js index 03c0714864f92..8021ae7e5f63c 100644 --- a/x-pack/plugins/monitoring/public/views/elasticsearch/node/advanced/index.js +++ b/x-pack/plugins/monitoring/public/views/elasticsearch/node/advanced/index.js @@ -117,6 +117,7 @@ uiRoutes.when('/elasticsearch/nodes/:node/advanced', { { - let cpuUsage = 0; if (this.config.ui.container.elasticsearch.enabled) { - cpuUsage = + stat.cpuUsage = (stat.containerUsage / (stat.containerPeriods * stat.containerQuota * 1000)) * 100; - } else { - cpuUsage = stat.cpuUsage; } return { instanceKey: `${stat.clusterUuid}:${stat.nodeId}`, clusterUuid: stat.clusterUuid, - shouldFire: cpuUsage > params.threshold, + shouldFire: stat.cpuUsage > params.threshold, severity: AlertSeverity.Danger, meta: stat, ccs: stat.ccs, diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts index 12926a30efa1b..88035c1121848 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts @@ -97,11 +97,18 @@ describe('fetchCpuUsageNodeStats', () => { }, ], }, - average_usage: { - value: 10, - }, - average_periods: { - value: 5, + histo: { + buckets: [ + null, + { + usage_deriv: { + normalized_value: 10, + }, + periods_deriv: { + normalized_value: 5, + }, + }, + ], }, average_quota: { value: 50, @@ -185,14 +192,14 @@ describe('fetchCpuUsageNodeStats', () => { }); await fetchCpuUsageNodeStats(callCluster, clusters, index, startMs, endMs, size); expect(params).toStrictEqual({ - index, + index: '.monitoring-es-*', filterPath: ['aggregations'], body: { size: 0, query: { bool: { filter: [ - { terms: { cluster_uuid: clusters.map((cluster) => cluster.clusterUuid) } }, + { terms: { cluster_uuid: ['abc123'] } }, { term: { type: 'node_stats' } }, { range: { timestamp: { format: 'epoch_millis', gte: 0, lte: 0 } } }, ], @@ -200,23 +207,38 @@ describe('fetchCpuUsageNodeStats', () => { }, aggs: { clusters: { - terms: { - field: 'cluster_uuid', - size, - include: clusters.map((cluster) => cluster.clusterUuid), - }, + terms: { field: 'cluster_uuid', size: 10, include: ['abc123'] }, aggs: { nodes: { - terms: { field: 'node_stats.node_id', size }, + terms: { field: 'node_stats.node_id', size: 10 }, aggs: { index: { terms: { field: '_index', size: 1 } }, average_cpu: { avg: { field: 'node_stats.process.cpu.percent' } }, - average_usage: { avg: { field: 'node_stats.os.cgroup.cpuacct.usage_nanos' } }, - average_periods: { - avg: { field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods' }, - }, average_quota: { avg: { field: 'node_stats.os.cgroup.cpu.cfs_quota_micros' } }, name: { terms: { field: 'source_node.name', size: 1 } }, + histo: { + date_histogram: { field: 'timestamp', fixed_interval: '0m' }, + aggs: { + average_periods: { + max: { field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods' }, + }, + average_usage: { max: { field: 'node_stats.os.cgroup.cpuacct.usage_nanos' } }, + usage_deriv: { + derivative: { + buckets_path: 'average_usage', + gap_policy: 'skip', + unit: '1s', + }, + }, + periods_deriv: { + derivative: { + buckets_path: 'average_periods', + gap_policy: 'skip', + unit: '1s', + }, + }, + }, + }, }, }, }, diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts index 4fdb03b61950e..ecd324c083a8c 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts @@ -4,6 +4,8 @@ * you may not use this file except in compliance with the Elastic License. */ import { get } from 'lodash'; +import moment from 'moment'; +import { NORMALIZED_DERIVATIVE_UNIT } from '../../../common/constants'; import { AlertCluster, AlertCpuUsageNodeStats } from '../../alerts/types'; interface NodeBucketESResponse { @@ -26,6 +28,9 @@ export async function fetchCpuUsageNodeStats( endMs: number, size: number ): Promise { + // Using pure MS didn't seem to work well with the date_histogram interval + // but minutes does + const intervalInMinutes = moment.duration(endMs - startMs).asMinutes(); const filterPath = ['aggregations']; const params = { index, @@ -82,16 +87,6 @@ export async function fetchCpuUsageNodeStats( field: 'node_stats.process.cpu.percent', }, }, - average_usage: { - avg: { - field: 'node_stats.os.cgroup.cpuacct.usage_nanos', - }, - }, - average_periods: { - avg: { - field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods', - }, - }, average_quota: { avg: { field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', @@ -103,6 +98,38 @@ export async function fetchCpuUsageNodeStats( size: 1, }, }, + histo: { + date_histogram: { + field: 'timestamp', + fixed_interval: `${intervalInMinutes}m`, + }, + aggs: { + average_periods: { + max: { + field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods', + }, + }, + average_usage: { + max: { + field: 'node_stats.os.cgroup.cpuacct.usage_nanos', + }, + }, + usage_deriv: { + derivative: { + buckets_path: 'average_usage', + gap_policy: 'skip', + unit: NORMALIZED_DERIVATIVE_UNIT, + }, + }, + periods_deriv: { + derivative: { + buckets_path: 'average_periods', + gap_policy: 'skip', + unit: NORMALIZED_DERIVATIVE_UNIT, + }, + }, + }, + }, }, }, }, @@ -120,17 +147,19 @@ export async function fetchCpuUsageNodeStats( ) as ClusterBucketESResponse[]; for (const clusterBucket of clusterBuckets) { for (const node of clusterBucket.nodes.buckets) { + const lastBucket = get(node, 'histo.buckets[1]', {}); const indexName = get(node, 'index.buckets[0].key', ''); - stats.push({ + const stat = { clusterUuid: clusterBucket.key, nodeId: node.key, nodeName: get(node, 'name.buckets[0].key'), cpuUsage: get(node, 'average_cpu.value'), - containerUsage: get(node, 'average_usage.value'), - containerPeriods: get(node, 'average_periods.value'), + containerUsage: get(lastBucket, 'usage_deriv.normalized_value'), + containerPeriods: get(lastBucket, 'periods_deriv.normalized_value'), containerQuota: get(node, 'average_quota.value'), ccs: indexName.includes(':') ? indexName.split(':')[0] : null, - }); + }; + stats.push(stat); } } return stats;