Skip to content

Commit

Permalink
fix: change query for uptime stat panel (#840)
Browse files Browse the repository at this point in the history
* fix: change query for uptime stat panel

* fix: add new version of uptime query but keep the old one

The new version will be behind a feature flag.

* feat: add new uptime query feature flag

* feat: choose uptime query version based on feature flag value
  • Loading branch information
VikaCep authored Aug 23, 2024
1 parent a616a55 commit 90b81dd
Show file tree
Hide file tree
Showing 10 changed files with 62 additions and 35 deletions.
21 changes: 12 additions & 9 deletions src/page/DashboardPage.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@ import { useParams } from 'react-router-dom';
import { SceneApp, SceneAppPage } from '@grafana/scenes';
import { Spinner } from '@grafana/ui';

import { CheckPageParams, CheckType, DashboardSceneAppConfig } from 'types';
import { CheckPageParams, CheckType, DashboardSceneAppConfig, FeatureName } from 'types';
import { getCheckType } from 'utils';
import { useChecks } from 'data/useChecks';
import { useFeatureFlag } from 'hooks/useFeatureFlag';
import { useLogsDS } from 'hooks/useLogsDS';
import { useMetricsDS } from 'hooks/useMetricsDS';
import { useSMDS } from 'hooks/useSMDS';
Expand All @@ -28,6 +29,8 @@ function DashboardPageContent() {

const checkToView = checks.find((check) => String(check.id) === id);

const newUptimeQuery = useFeatureFlag(FeatureName.UptimeQueryV2)?.isEnabled;

const scene = useMemo(() => {
const metricsDef = {
uid: metricsDS?.uid,
Expand Down Expand Up @@ -56,7 +59,7 @@ function DashboardPageContent() {
new SceneAppPage({
title: checkToView.job,
url,
getScene: getDNSScene(config, [checkToView]),
getScene: getDNSScene(config, [checkToView], newUptimeQuery),
}),
],
});
Expand All @@ -67,7 +70,7 @@ function DashboardPageContent() {
new SceneAppPage({
title: checkToView.job,
url,
getScene: getHTTPScene(config, [checkToView]),
getScene: getHTTPScene(config, [checkToView], newUptimeQuery),
}),
],
});
Expand All @@ -78,7 +81,7 @@ function DashboardPageContent() {
new SceneAppPage({
title: checkToView.job,
url,
getScene: getBrowserScene(config, [checkToView], checkType),
getScene: getBrowserScene(config, [checkToView], checkType, newUptimeQuery),
}),
],
});
Expand All @@ -90,7 +93,7 @@ function DashboardPageContent() {
new SceneAppPage({
title: checkToView.job,
url,
getScene: getScriptedScene(config, [checkToView], checkType),
getScene: getScriptedScene(config, [checkToView], checkType, newUptimeQuery),
}),
],
});
Expand All @@ -101,7 +104,7 @@ function DashboardPageContent() {
new SceneAppPage({
title: checkToView.job,
url,
getScene: getPingScene(config, [checkToView]),
getScene: getPingScene(config, [checkToView], newUptimeQuery),
}),
],
});
Expand All @@ -112,7 +115,7 @@ function DashboardPageContent() {
new SceneAppPage({
title: checkToView.job,
url,
getScene: getTcpScene(config, [checkToView]),
getScene: getTcpScene(config, [checkToView], newUptimeQuery),
}),
],
});
Expand All @@ -135,13 +138,13 @@ function DashboardPageContent() {
new SceneAppPage({
title: checkToView.job,
url,
getScene: getGRPCScene(config, [checkToView]),
getScene: getGRPCScene(config, [checkToView], newUptimeQuery),
}),
],
});
}
}
}, [smDS, metricsDS, logsDS, checkToView]);
}, [smDS, metricsDS, logsDS, checkToView, newUptimeQuery]);

if (!scene || isLoading) {
return <Spinner />;
Expand Down
5 changes: 3 additions & 2 deletions src/scenes/BROWSER/browserScene.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ import { getProbeDuration } from './probeDuration';
export function getBrowserScene(
{ metrics, logs, singleCheckMode }: DashboardSceneAppConfig,
checks: Check[] = [],
checkType: CheckType
checkType: CheckType,
newUptimeQuery = false,
) {
return () => {
if (checks.length === 0) {
Expand All @@ -49,7 +50,7 @@ export function getBrowserScene(
const minStep = getMinStepFromFrequency(checks?.[0]?.frequency);

const reachability = getReachabilityStat(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep, newUptimeQuery);

const distinctTargets = getDistinctTargets(metrics);
const probeDuration = getProbeDuration(metrics);
Expand Down
45 changes: 33 additions & 12 deletions src/scenes/Common/uptimeStat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,45 @@ function getMinStep(minStep: string) {
}
}

function getQueryRunner(metrics: DataSourceRef, minStep: string) {
function getQueryRunner(metrics: DataSourceRef, minStep: string, newUptimeQuery: boolean) {
// The min step for most queries is a minimum of 1 minute. For uptime, however, we want to make sure we have steps of at least 5 minutes in order for the math to work out.
const uptimeMinStep = getMinStep(minStep);

const uptimeCalculationQueryV1 = `# the inner query is going to produce a non-zero value if there was at least one successful check during the 5 minute window
# so make it a 1 if there was at least one success and a 0 otherwise
ceil(
# the number of successes across all probes
sum by (instance, job) (increase(probe_all_success_sum{instance="$instance", job="$job"}[$__rate_interval]))
/
# the total number of times we checked across all probes
(sum by (instance, job) (increase(probe_all_success_count{instance="$instance", job="$job"}[$__rate_interval])) + 1) # + 1 because we want to make sure it goes to 1, not 2
)`;

// The V1 query above does not return the expected uptime in all cases.
// For this reason we created a new version (V2), which is being rolled out progressively behind a feature flag.
// See https://github.com/grafana/support-escalations/issues/11197#issuecomment-2307435564 for context and details.
const uptimeCalculationQueryV2 = `floor(
# Report a 1 if there's a location where most observations were successful and 0 if most observations failed for all probes.
max by (instance, job) (
round(
# the number of successes for each probe
(increase(probe_all_success_sum{instance="$instance", job="$job"}[$__rate_interval]))
/
# the total number of times we checked for each probe
((increase(probe_all_success_count{instance="$instance", job="$job"}[$__rate_interval])))
)
)
)`;

const uptimeQuery = newUptimeQuery ? uptimeCalculationQueryV2 : uptimeCalculationQueryV1;

const runner = new SceneQueryRunner({
datasource: metrics,
queries: [
{
editorMode: 'code',
exemplar: true,
expr: `# the inner query is going to produce a non-zero value if there was at least one successful check during the 5 minute window
# so make it a 1 if there was at least one success and a 0 otherwise
ceil(
# the number of successes across all probes
sum by (instance, job) (increase(probe_all_success_sum{instance="$instance", job="$job"}[$__rate_interval]))
/
# the total number of times we checked across all probes
(sum by (instance, job) (increase(probe_all_success_count{instance="$instance", job="$job"}[$__rate_interval])) + 1) # + 1 because we want to make sure it goes to 1, not 2
)`,
expr: uptimeQuery,
hide: false,
instant: false,
interval: uptimeMinStep,
Expand All @@ -54,12 +75,12 @@ function getQueryRunner(metrics: DataSourceRef, minStep: string) {
});
}

export function getUptimeStat(metrics: DataSourceRef, minStep: string) {
export function getUptimeStat(metrics: DataSourceRef, minStep: string, newUptimeQuery = false) {
return new ExplorablePanel({
pluginId: 'stat',
title: 'Uptime',
description: UPTIME_DESCRIPTION,
$data: getQueryRunner(metrics, minStep),
$data: getQueryRunner(metrics, minStep, newUptimeQuery),
fieldConfig: {
defaults: {
decimals: 2,
Expand Down
4 changes: 2 additions & 2 deletions src/scenes/DNS/dnsScene.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import { getMinStepFromFrequency } from 'scenes/utils';
import { getAnswerRecordsStat } from './answerRecords';
import { getResourcesRecordsPanel } from './resourceRecords';

export function getDNSScene({ metrics, logs, singleCheckMode }: DashboardSceneAppConfig, checks: Check[]) {
export function getDNSScene({ metrics, logs, singleCheckMode }: DashboardSceneAppConfig, checks: Check[], newUptimeQuery = false) {
return () => {
if (checks.length === 0) {
return getEmptyScene(CheckType.DNS);
Expand All @@ -48,7 +48,7 @@ export function getDNSScene({ metrics, logs, singleCheckMode }: DashboardSceneAp

const minStep = getMinStepFromFrequency(checks?.[0]?.frequency);
const errorMap = getErrorRateMapPanel(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep, newUptimeQuery);
const reachability = getReachabilityStat(metrics, minStep);
const avgLatency = getAvgLatencyStat(metrics, minStep);
const answerRecords = getAnswerRecordsStat(metrics);
Expand Down
4 changes: 2 additions & 2 deletions src/scenes/GRPC/getGRPCScene.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import { getMinStepFromFrequency } from '../utils';

// This is a placeholder scene for GRPC checks (basically a copy of the TCP scene)
// TODO: Implement the actual GRPC scene
export function getGRPCScene({ metrics, logs, singleCheckMode }: DashboardSceneAppConfig, checks: Check[]) {
export function getGRPCScene({ metrics, logs, singleCheckMode }: DashboardSceneAppConfig, checks: Check[], newUptimeQuery = false) {
return () => {
if (checks.length === 0) {
return getEmptyScene(CheckType.GRPC);
Expand All @@ -46,7 +46,7 @@ export function getGRPCScene({ metrics, logs, singleCheckMode }: DashboardSceneA
const variables = new SceneVariableSet({ variables: [probe, job, instance] });
const minStep = getMinStepFromFrequency(checks?.[0]?.frequency);
const errorMap = getErrorRateMapPanel(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep, newUptimeQuery);
const reachability = getReachabilityStat(metrics, minStep);
const avgLatency = getAvgLatencyStat(metrics, minStep);
const frequency = getFrequencyStat(metrics);
Expand Down
4 changes: 2 additions & 2 deletions src/scenes/HTTP/httpScene.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ import {
import { getErrorRateTimeseries } from './errorRateTimeseries';
import { getLatencyByPhasePanel } from './latencyByPhase';

export function getHTTPScene({ metrics, logs, singleCheckMode }: DashboardSceneAppConfig, checks: Check[]) {
export function getHTTPScene({ metrics, logs, singleCheckMode }: DashboardSceneAppConfig, checks: Check[], newUptimeQuery = false) {
return () => {
const timeRange = new SceneTimeRange({
from: 'now-1h',
Expand All @@ -47,7 +47,7 @@ export function getHTTPScene({ metrics, logs, singleCheckMode }: DashboardSceneA
const variableSet = new SceneVariableSet({ variables: [probe, job, instance] });

const mapPanel = getErrorRateMapPanel(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep, newUptimeQuery);
const reachability = getReachabilityStat(metrics, minStep);
const avgLatency = getAvgLatencyStat(metrics, minStep);
const sslExpiryStat = getSSLExpiryStat(metrics);
Expand Down
4 changes: 2 additions & 2 deletions src/scenes/PING/pingScene.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import { getMinStepFromFrequency } from 'scenes/utils';

import { getLatencyByPhasePanel } from './latencyByPhase';

export function getPingScene({ metrics, logs, singleCheckMode }: DashboardSceneAppConfig, checks: Check[]) {
export function getPingScene({ metrics, logs, singleCheckMode }: DashboardSceneAppConfig, checks: Check[], newUptimeQuery = false) {
return () => {
if (checks.length === 0) {
return getEmptyScene(CheckType.PING);
Expand All @@ -47,7 +47,7 @@ export function getPingScene({ metrics, logs, singleCheckMode }: DashboardSceneA

const minStep = getMinStepFromFrequency(checks?.[0]?.frequency);
const errorMap = getErrorRateMapPanel(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep, newUptimeQuery);
const reachability = getReachabilityStat(metrics, minStep);
const avgLatency = getAvgLatencyStat(metrics, minStep);
const frequency = getFrequencyStat(metrics);
Expand Down
5 changes: 3 additions & 2 deletions src/scenes/SCRIPTED/scriptedScene.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ import { getProbeDuration } from './probeDuration';
export function getScriptedScene(
{ metrics, logs, singleCheckMode }: DashboardSceneAppConfig,
checks: Check[] = [],
checkType: CheckType
checkType: CheckType,
newUptimeQuery = false,
) {
return () => {
if (checks.length === 0) {
Expand All @@ -48,7 +49,7 @@ export function getScriptedScene(
const minStep = getMinStepFromFrequency(checks?.[0]?.frequency);

const reachability = getReachabilityStat(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep, newUptimeQuery);

const distinctTargets = getDistinctTargets(metrics);
const probeDuration = getProbeDuration(metrics);
Expand Down
4 changes: 2 additions & 2 deletions src/scenes/TCP/getTcpScene.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import { getEmptyScene } from 'scenes/Common/emptyScene';
import { getErrorRateTimeseries } from 'scenes/HTTP/errorRateTimeseries';
import { getMinStepFromFrequency } from 'scenes/utils';

export function getTcpScene({ metrics, logs, singleCheckMode }: DashboardSceneAppConfig, checks: Check[]) {
export function getTcpScene({ metrics, logs, singleCheckMode }: DashboardSceneAppConfig, checks: Check[], newUptimeQuery = false) {
return () => {
if (checks.length === 0) {
return getEmptyScene(CheckType.TCP);
Expand All @@ -46,7 +46,7 @@ export function getTcpScene({ metrics, logs, singleCheckMode }: DashboardSceneAp

const minStep = getMinStepFromFrequency(checks?.[0]?.frequency);
const errorMap = getErrorRateMapPanel(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep);
const uptime = getUptimeStat(metrics, minStep, newUptimeQuery);
const reachability = getReachabilityStat(metrics, minStep);
const avgLatency = getAvgLatencyStat(metrics, minStep);
const sslExpiry = getSSLExpiryStat(metrics);
Expand Down
1 change: 1 addition & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,7 @@ export enum FeatureName {
GRPCChecks = 'grpc-checks',
ScriptedChecks = 'scripted-checks',
UnifiedAlerting = 'ngalert',
UptimeQueryV2 = 'uptime-query-v2',
__TURNOFF = 'test-only-do-not-use',
}

Expand Down

0 comments on commit 90b81dd

Please sign in to comment.