audit: Disable auditing in recovery mode

Fixes: #15226

Audit logging cannot function in recovery mode because produce messages are rejected. In this situation, the cluster may become unusable if audit messages are generated and the audit queues are unable to drain. This change disables auditing when the node starts in recovery mode.

Also fixes a logic error in the ducktape tests.

Signed-off-by: Michael Boquard <michael@redpanda.com>
redpanda-data · Nov 30, 2023 · 6460a4b · 6460a4b
1 parent 398eccd
commit 6460a4b
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 2 deletions.
diff --git a/src/v/security/audit/audit_log_manager.cc b/src/v/security/audit/audit_log_manager.cc
@@ -741,6 +741,19 @@ bool audit_log_manager::is_audit_event_enabled(event_type event_type) const {
 }
 
 ss::future<> audit_log_manager::start() {
+    bool recovery_mode_enabled = config::node().recovery_mode_enabled.value();
+    if (recovery_mode_enabled) {
+        vlog(
+          adtlog.warn,
+          "**************************************************************");
+        vlog(
+          adtlog.warn,
+          "Redpanda is operating in recovery mode.  Auditing is disabled!");
+        vlog(
+          adtlog.warn,
+          "**************************************************************");
+        co_return;
+    }
     _probe = std::make_unique<audit_probe>();
     _probe->setup_metrics([this] {
         return static_cast<double>(pending_events())

diff --git a/tests/rptest/tests/audit_log_test.py b/tests/rptest/tests/audit_log_test.py
@@ -555,7 +555,7 @@ def ingest(self, records):
                     )
                     return
                 self.next_offset_ingest = len(records)
-                new_records = [json.loads(msg['value']) for msg in records]
+                new_records = [json.loads(msg['value']) for msg in new_records]
                 self.logger.info(f"Ingested: {len(new_records)} records")
                 self.logger.debug(f'Ingested records:')
                 for rec in new_records:
@@ -670,6 +670,39 @@ def test_drain_on_audit_disabled(self):
             lambda record_count: record_count == 3,
             "One stop event observed for shutdown node")
 
+    @cluster(num_nodes=5)
+    def test_recovery_mode(self):
+        """
+        Tests that audit logging does not start when in recovery mode
+        """
+
+        # Expect to find the audit system to come up
+        _ = self.find_matching_record(
+            partial(AuditLogTestsAppLifecycle.is_lifecycle_match,
+                    "Audit System", True),
+            lambda record_count: record_count == 3,
+            "Single redpanda audit start event per node")
+        self.redpanda.restart_nodes(
+            self.redpanda.nodes,
+            override_cfg_params={"recovery_mode_enabled": True})
+        wait_until(lambda: self.redpanda.search_log_any(
+            'Redpanda is operating in recovery mode.  Auditing is disabled!'),
+                   timeout_sec=30,
+                   backoff_sec=2,
+                   err_msg="Did not find expected log statement")
+        self.redpanda.restart_nodes(
+            self.redpanda.nodes,
+            override_cfg_params={"recovery_mode_enabled": False})
+        # Now we should see it 6 times, 3 times for initial boot, and 3 more times for this latest
+        # boot.  Seeing >6 would mean auditing somehow worked while in recovery mode
+        records = self.find_matching_record(
+            partial(AuditLogTestsAppLifecycle.is_lifecycle_match,
+                    "Audit System", True),
+            lambda record_count: record_count >= 6,
+            "Single redpanda audit start event per node")
+        assert len(
+            records) == 6, f'Expected 6 start up records, found {len(records)}'
+
 
 class AuditLogTestAdminApi(AuditLogTestBase):
     """Validates that audit logs are generated from admin API
@@ -1857,7 +1890,7 @@ def match_authn_user(user, svc_name, result, record):
         _ = self.find_matching_record(
             lambda record: match_authn_user(self.username, self.
                                             sr_audit_svc_name, 1, record),
-            lambda record_count: record_count > 1, 'authn attempt in sr')
+            lambda record_count: record_count == 1, 'authn attempt in sr')
 
     @cluster(num_nodes=5)
     def test_sr_audit_bad_authn(self):