Skip to content

Commit

Permalink
[Fleet] Upgrade details telemetry (#173356)
Browse files Browse the repository at this point in the history
## Summary

Relates #162448

Added upgrade details telemetry, published to the `fleet-agents` index in
the telemetry cluster, with each bucket sent as a separate document.
Implemented by doing a `multi_terms` aggregation to group the same
`target_version, state, error_msg` values together.
Do we also want to include the agent count in each bucket in the
telemetry event? @jlind23 @ycombinator

Note: since this task runs every hour, it will most likely capture the
`UPG_FAILED` states, because the other (success) states are present on
the agent docs only temporarily and are removed once the upgrade succeeds.

E.g. two agent docs with the same upgrade details, like the one below, become one telemetry event:
```
// .fleet-agents
   upgrade_details: {
            target_version: '8.12.0',
            state: 'UPG_FAILED',
            metadata: {
              error_msg: 'Download failed',
            },
          },

// telemetry event
{
      target_version: '8.12.0',
      state: 'UPG_FAILED',
      error_msg: 'Download failed',
    }
```

To verify:
- start kibana 8.13-SNAPSHOT locally
- set an invalid agent download source in Fleet Settings
- enroll an agent version 8.12-SNAPSHOT
- upgrade to 8.13-SNAPSHOT with the API
```
POST kbn:/api/fleet/agents/<agent_id>/upgrade
  {
    "version": "8.13.0-SNAPSHOT",
    "force": true
  }
```
- wait 15 minutes for the upgrade to go into the failed state
- wait up to 1 hour for the telemetry task to run (speed this up locally by
setting a shorter interval in `FleetUsageSender` in Kibana)
- verify in debug logs:
```
[2023-12-14T14:26:28.832+01:00][DEBUG][plugins.fleet] Agents upgrade details telemetry: [{"target_version":"8.13.0-SNAPSHOT","state":"UPG_FAILED","error_msg":"failed download of agent binary: unable to download package: 3 errors occurred:\n\t* package '/Library/Elastic/Agent/data/elastic-agent-f383c6/downloads/elastic-agent-8.13.0-SNAPSHOT-darwin-aarch64.tar.gz' not found: open /Library/Elastic/Agent/data/elastic-agent-f383c6/downloads/elastic-agent-8.13.0-SNAPSHOT-darwin-aarch64.tar.gz: no such file or directory\n\t* call to 'https://artifacts.elastic.co/downloads/dummy/beats/elastic-agent/elastic-agent-8.13.0-SNAPSHOT-darwin-aarch64.tar.gz' returned unsuccessful status code: 404\n\t* call to 'https://artifacts.elastic.co/downloads/dummy/beats/elastic-agent/elastic-agent-8.13.0-SNAPSHOT-darwin-aarch64.tar.gz' returned unsuccessful status code: 404\n\n"}]
```

### Checklist

- [x] [Unit or functional
tests](https://www.elastic.co/guide/en/kibana/master/development-tests.html)
were updated or added to match the most common scenarios
  • Loading branch information
juliaElastic authored Dec 18, 2023
1 parent b8a23eb commit a61b864
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 3 deletions.
34 changes: 34 additions & 0 deletions x-pack/plugins/fleet/server/collectors/agent_collectors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,20 @@ export interface AgentData {
version: string;
count: number;
}>;
upgrade_details: Array<{
target_version: string;
state: string;
error_msg: string;
agent_count: number;
}>;
}

/**
 * Default agent data: zeroed check-in status counters and an empty list for
 * each of the per-policy, per-version, per-OS and upgrade-details breakdowns.
 */
const DEFAULT_AGENT_DATA = {
  agent_checkin_status: {
    error: 0,
    degraded: 0,
  },
  agents_per_policy: [],
  agents_per_version: [],
  agents_per_os: [],
  upgrade_details: [],
};

export const getAgentData = async (
Expand Down Expand Up @@ -135,6 +142,23 @@ export const getAgentData = async (
],
},
},
upgrade_details: {
multi_terms: {
size: 1000,
terms: [
{
field: 'upgrade_details.target_version.keyword',
},
{
field: 'upgrade_details.state',
},
{
field: 'upgrade_details.metadata.error_msg.keyword',
missing: '',
},
],
},
},
},
},
{ signal: abortController.signal }
Expand Down Expand Up @@ -190,11 +214,21 @@ export const getAgentData = async (
count: bucket.doc_count,
}));

// Flatten every `upgrade_details` multi_terms bucket into one telemetry record.
// `bucket.key` holds the grouped values in the order the terms are declared in
// the aggregation request: target_version, state, error_msg (error_msg is ''
// when the agent doc has none, per the `missing: ''` setting on that term).
// The optional chain is carried through to `.buckets` so that a response
// without this aggregation yields an empty list instead of a TypeError.
const upgradeDetailsBuckets: Array<{ key: [string, string, string]; doc_count: number }> =
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- aggregation result is untyped here
  (response?.aggregations?.upgrade_details as any)?.buckets ?? [];
const upgradeDetails = upgradeDetailsBuckets.map(
  ({ key: [targetVersion, state, errorMsg], doc_count: agentCount }) => ({
    target_version: targetVersion,
    state,
    error_msg: errorMsg,
    agent_count: agentCount,
  })
);

return {
agent_checkin_status: statuses,
agents_per_policy: agentsPerPolicy,
agents_per_version: agentsPerVersion,
agents_per_os: agentsPerOS,
upgrade_details: upgradeDetails,
};
} catch (error) {
if (error.statusCode === 404) {
Expand Down
4 changes: 3 additions & 1 deletion x-pack/plugins/fleet/server/collectors/agents_per_output.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ export async function getAgentsPerOutput(
if (!outputTypeSupportPresets(output.type)) {
return;
}

if (!outputTypes[output.type]) {
return;
}
const outputTelemetryRecord = outputTypes[output.type];

if (!outputTelemetryRecord.preset_counts) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,13 @@ describe('fleet usage telemetry', () => {
status: 'HEALTHY',
},
],
upgrade_details: {
target_version: '8.12.0',
state: 'UPG_FAILED',
metadata: {
error_msg: 'Download failed',
},
},
},
{
create: {
Expand Down Expand Up @@ -176,6 +183,13 @@ describe('fleet usage telemetry', () => {
status: 'HEALTHY',
},
],
upgrade_details: {
target_version: '8.12.0',
state: 'UPG_FAILED',
metadata: {
error_msg: 'Agent crash detected',
},
},
},
{
create: {
Expand Down Expand Up @@ -220,6 +234,11 @@ describe('fleet usage telemetry', () => {
last_checkin: new Date(Date.now() - 1000 * 60 * 6).toISOString(),
active: true,
policy_id: 'policy2',
upgrade_details: {
target_version: '8.11.0',
state: 'UPG_ROLLBACK',
metadata: {},
},
},
{
create: {
Expand Down Expand Up @@ -557,5 +576,24 @@ describe('fleet usage telemetry', () => {
fleet_server_logs_top_errors: ['failed to unenroll offline agents'],
})
);
expect(usage?.upgrade_details.length).toBe(3);
expect(usage?.upgrade_details).toContainEqual({
target_version: '8.12.0',
state: 'UPG_FAILED',
error_msg: 'Download failed',
agent_count: 1,
});
expect(usage?.upgrade_details).toContainEqual({
target_version: '8.12.0',
state: 'UPG_FAILED',
error_msg: 'Agent crash detected',
agent_count: 1,
});
expect(usage?.upgrade_details).toContainEqual({
target_version: '8.11.0',
state: 'UPG_ROLLBACK',
error_msg: '',
agent_count: 1,
});
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ const FLEET_AGENTS_EVENT_TYPE = 'fleet_agents';

export class FleetUsageSender {
private taskManager?: TaskManagerStartContract;
private taskVersion = '1.1.3';
private taskVersion = '1.1.4';
private taskType = 'Fleet-Usage-Sender';
private wasStarted: boolean = false;
private interval = '1h';
Expand Down Expand Up @@ -83,6 +83,7 @@ export class FleetUsageSender {
const {
agents_per_version: agentsPerVersion,
agents_per_output_type: agentsPerOutputType,
upgrade_details: upgradeDetails,
...fleetUsageData
} = usageData;
appContextService
Expand All @@ -106,6 +107,13 @@ export class FleetUsageSender {
agents_per_output_type: byOutputType,
});
});

appContextService
.getLogger()
.debug('Agents upgrade details telemetry: ' + JSON.stringify(upgradeDetails));
upgradeDetails.forEach((upgradeDetailsObj) => {
core.analytics.reportEvent(FLEET_AGENTS_EVENT_TYPE, { upgrade_details: upgradeDetailsObj });
});
} catch (error) {
appContextService
.getLogger()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,10 @@ export const fleetAgentsSchema: RootSchema<any> = {
description: 'Output type used by agent',
},
},
presets_counts: {
preset_counts: {
_meta: {
description: 'Count of agents per preset',
optional: true,
},
properties: {
balanced: {
Expand Down Expand Up @@ -117,6 +118,7 @@ export const fleetAgentsSchema: RootSchema<any> = {
type: 'keyword',
_meta: {
description: 'Output preset used by agent, if applicable',
optional: true,
},
},
count_as_data: {
Expand All @@ -133,6 +135,38 @@ export const fleetAgentsSchema: RootSchema<any> = {
},
},
},
upgrade_details: {
_meta: {
description: 'Agent upgrade details telemetry',
optional: true,
},
properties: {
target_version: {
type: 'keyword',
_meta: {
description: 'Target version of the agent upgrade',
},
},
state: {
type: 'keyword',
_meta: {
description: 'State of the agent upgrade',
},
},
error_msg: {
type: 'keyword',
_meta: {
description: 'Error message of the agent upgrade if failed',
},
},
agent_count: {
type: 'long',
_meta: {
description: 'How many agents have this upgrade details',
},
},
},
},
};

export const fleetUsagesSchema: RootSchema<any> = {
Expand Down

0 comments on commit a61b864

Please sign in to comment.