From 4dcd3f86b9a6bc6d24d9939dab00d803583d48d8 Mon Sep 17 00:00:00 2001 From: Adrian Serrano Date: Tue, 24 Nov 2020 10:23:48 +0100 Subject: [PATCH] [Auditbeat] Recover from errors in audit monitoring routine (#22673) The auditd module spawns a monitoring goroutine that fetches auditd status every 15s. Because this routine uses a single audit client, if an update fails (because a netlink message is late or for other reasons), the audit client can get out of sync with the stream, causing all subsequent requests to fail. For reasons that aren't 100% clear to me at the moment, this error condition leads to a lot of `[audit_send_repl]` (2.6.x) / `[audit_send_reply]` (3.x+) kernel threads being created. ``` ERROR [auditd] auditd/audit_linux.go:183 get status request failed:failed to get audit status ack: unexpected sequence number for reply (expected 6286 but got 6285) ``` ``` $ ps -ef [...] root 27790 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27791 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27792 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27793 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27794 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27795 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27796 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27797 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27798 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27799 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27800 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27801 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27802 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27803 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27804 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27805 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27806 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27807 2 0 12:52 ? 00:00:00 [audit_send_repl] root 27808 2 0 12:52 ? 00:00:00 [audit_send_repl] [...] 
``` This patch updates the error-handling logic to create a new audit client when a status update fails, allowing the monitoring routine to recover and preventing the proliferation of `audit_send_repl` kernel threads. (cherry picked from commit ca9550f5c8243428c84d7a32843360875e6e3736) --- CHANGELOG.next.asciidoc | 1 + auditbeat/module/auditd/audit_linux.go | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index a08db17834f..480a09b17bc 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -131,6 +131,7 @@ https://github.com/elastic/beats/compare/v7.0.0-alpha2...master[Check the HEAD d - system/package: Fix an error that can occur while trying to persist package metadata. {issue}18536[18536] {pull}18887[18887] - system/socket: Fix dataset using 100% CPU and becoming unresponsive in some scenarios. {pull}19033[19033] {pull}19764[19764] - system/socket: Fixed tracking of long-running connections. {pull}19033[19033] +- auditd: Fix an error condition causing a lot of `audit_send_reply` kernel threads being created. {pull}22673[22673] *Filebeat* diff --git a/auditbeat/module/auditd/audit_linux.go b/auditbeat/module/auditd/audit_linux.go index 1586eaeaffa..a2c9e004877 100644 --- a/auditbeat/module/auditd/audit_linux.go +++ b/auditbeat/module/auditd/audit_linux.go @@ -163,7 +163,11 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) { ms.log.Errorw("Failure creating audit monitoring client", "error", err) } go func() { - defer client.Close() + defer func() { // Close the most recently allocated "client" instance. 
+ if client != nil { + client.Close() + } + }() timer := time.NewTicker(lostEventsUpdateInterval) defer timer.Stop() for { @@ -175,6 +179,15 @@ func (ms *MetricSet) Run(reporter mb.PushReporterV2) { ms.updateKernelLostMetric(status.Lost) } else { ms.log.Error("get status request failed:", err) + if err = client.Close(); err != nil { + ms.log.Errorw("Error closing audit monitoring client", "error", err) + } + client, err = libaudit.NewAuditClient(nil) + if err != nil { + ms.log.Errorw("Failure creating audit monitoring client", "error", err) + reporter.Error(err) + return + } } } }