From bf6bc91ae98162e9504cb33fbffde62f5045211c Mon Sep 17 00:00:00 2001 From: Andrea Barbadoro Date: Fri, 10 May 2024 17:21:33 +0200 Subject: [PATCH 1/5] admin/server: fix rearm_log_level_timer _log_level_resets is a map, we want to extract the next expiring level_reset to rearm the _log_level_timer. previously we would extract the min_element based on the key (the logger name) so this sequence of actions set log-level kafka trace 10 set log-level raft trace 10 only kafka would be expired, because once expired the next run would pick again kafka, see that there is no expiration, and not rearm the timer. --- src/v/redpanda/admin/server.cc | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/v/redpanda/admin/server.cc b/src/v/redpanda/admin/server.cc index 421001641def..40c3d2a02488 100644 --- a/src/v/redpanda/admin/server.cc +++ b/src/v/redpanda/admin/server.cc @@ -661,11 +661,17 @@ void admin_server::log_exception( void admin_server::rearm_log_level_timer() { _log_level_timer.cancel(); - auto next = std::min_element( - _log_level_resets.begin(), _log_level_resets.end()); + if (_log_level_resets.empty()) { + return; + } - if (next != _log_level_resets.end() && next->second.expires.has_value()) { - _log_level_timer.arm(next->second.expires.value()); + auto reset_values = _log_level_resets | std::views::values; + auto& lvl_rst = *std::ranges::min_element( + reset_values, std::less<>{}, [](level_reset const& l) { + return l.expires.value_or(ss::timer<>::clock::time_point::max()); + }); + if (lvl_rst.expires.has_value()) { + _log_level_timer.arm(lvl_rst.expires.value()); } } From 0717cd1a0c9f352ad8a66d6cdf10ae58f36147a1 Mon Sep 17 00:00:00 2001 From: Andrea Barbadoro Date: Fri, 10 May 2024 17:31:07 +0200 Subject: [PATCH 2/5] admin/server: remove unused level_reset::operator< --- src/v/redpanda/admin/server.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/v/redpanda/admin/server.h b/src/v/redpanda/admin/server.h index 427520898b42..c49f6956a2c1 100644 --- a/src/v/redpanda/admin/server.h +++ b/src/v/redpanda/admin/server.h @@ -655,16 +655,6 @@ class admin_server { : level(level) , expires(expires) {} - friend bool operator<(const level_reset& l, const level_reset& r) { - if (!l.expires.has_value()) { - return false; - } else if (!r.expires.has_value()) { - return true; - } else { - return l.expires.value() < r.expires.value(); - } - } - ss::log_level level; std::optional expires; }; From 6593dbec194446702799c3223caeb688000019cf Mon Sep 17 00:00:00 2001 From: Andrea Barbadoro Date: Fri, 10 May 2024 17:32:59 +0200 Subject: [PATCH 3/5] tests/log_level_test: test_log_level_multiple_expiry the test shows that this sequence of actions set log-level admin-api-server trace 10 set log-level raft trace 10 works and both logger are reset after the expiry period --- tests/rptest/tests/log_level_test.py | 29 ++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/rptest/tests/log_level_test.py b/tests/rptest/tests/log_level_test.py index 4c90b8aa258a..1c88c3b76879 100644 --- a/tests/rptest/tests/log_level_test.py +++ b/tests/rptest/tests/log_level_test.py @@ -7,11 +7,13 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0 +import time import ducktape.errors import requests.exceptions import urllib.parse import json +from ducktape.mark import parametrize from ducktape.utils.util import wait_until from rptest.services.cluster import cluster from rptest.tests.redpanda_test import RedpandaTest @@ -101,6 +103,33 @@ def test_log_level_control(self): backoff_sec=1, err_msg="Never saw message") + @cluster(num_nodes=1) + @parametrize(loggers=("admin_api_server", "raft")) + @parametrize(loggers=("raft", "admin_api_server")) + def test_log_level_multiple_expiry(self, loggers=tuple[str, str]): + """ + Check that more than one logger can be in a modified level and be expired correctly + see https://redpandadata.atlassian.net/browse/CORE-96 + """ + admin = Admin(self.redpanda) + node = self.redpanda.nodes[0] + + first_logger, second_logger = loggers + # set two loggers to trace, expect that both of them expires in a timely fashion + with self.redpanda.monitor_log(node) as mon: + admin.set_log_level(first_logger, "trace", expires=10) + time.sleep(1) + admin.set_log_level(second_logger, "trace", expires=10) + mon.wait_until(f"Expiring log level for {{{first_logger}}}", + timeout_sec=15, + backoff_sec=1, + err_msg=f"Never saw Expiring for {first_logger}") + mon.wait_until(f"Expiring log level for {{{second_logger}}}", + timeout_sec=15, + backoff_sec=1, + err_msg=f"Never saw Expiring for {second_logger}") + + @cluster(num_nodes=3) def test_max_expiry(self): admin = Admin(self.redpanda) From d781fb13e6051bf3b01821b9faac4687fa69f1c6 Mon Sep 17 00:00:00 2001 From: Andrea Barbadoro Date: Fri, 10 May 2024 17:38:18 +0200 Subject: [PATCH 4/5] admin/server: ensure that a true expire-never logger is never expired previously, this sequence of actions set log-level raft trace 10 set log-level raft error 0 would result in raft logging at error level for only 10 seconds, despite the legit request to expire never. this is because in the code, the second request would not correctly reset the expire field if a previous name-level_reset was already in the map used for this operation --- src/v/redpanda/admin/server.cc | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/v/redpanda/admin/server.cc b/src/v/redpanda/admin/server.cc index 40c3d2a02488..f940cb2e742a 100644 --- a/src/v/redpanda/admin/server.cc +++ b/src/v/redpanda/admin/server.cc @@ -1422,17 +1422,20 @@ void admin_server::register_config_routes() { ss::global_logger_registry().set_logger_level(name, new_level); - // expires=0 is same as not specifying it at all - if (expires_v / 1s > 0) { - auto when = ss::timer<>::clock::now() + expires_v; - auto res = _log_level_resets.try_emplace(name, cur_level, when); - if (!res.second) { - res.first->second.expires = when; + auto when = [&]() -> std::optional { + // expires=0 is same as not specifying it at all + if (expires_v / 1s > 0) { + return ss::timer<>::clock::now() + expires_v; + } else { + // new log level never expires, but we still want an entry in + // the resets map as a record of the default + return std::nullopt; } - } else { - // new log level never expires, but we still want an entry in the - // resets map as a record of the default - _log_level_resets.try_emplace(name, cur_level, std::nullopt); + }(); + + auto res = _log_level_resets.try_emplace(name, cur_level, when); + if (!res.second) { + res.first->second.expires = when; } rsp.expiration = expires_v / 1s; From 84748a8dcb4931258c5b9ef0cb3a32b9f2b9202b Mon Sep 17 00:00:00 2001 From: Andrea Barbadoro Date: Fri, 10 May 2024 17:46:57 +0200 Subject: [PATCH 5/5] tests/log_level_test: test_log_level_persist_a_never_expire_request test that set log-level admin_api_server trace 10 set log-level admin_api_server error 0 never resets the logger to the info level --- tests/rptest/tests/log_level_test.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/rptest/tests/log_level_test.py b/tests/rptest/tests/log_level_test.py index 1c88c3b76879..d0ff84908363 100644 --- a/tests/rptest/tests/log_level_test.py +++ b/tests/rptest/tests/log_level_test.py @@ -129,6 +129,33 @@ def test_log_level_multiple_expiry(self, loggers=tuple[str, str]): backoff_sec=1, err_msg=f"Never saw Expiring for {second_logger}") + @cluster(num_nodes=1) + def test_log_level_persist_a_never_expire_request(self): + """ + check that this sequence of actions + set log-level admin_api_server trace 10 + set log-level admin_api_server error 0 + + never resets the logger to the info level + """ + admin = Admin(self.redpanda) + node = self.redpanda.nodes[0] + + with self.redpanda.monitor_log(node) as mon: + admin.set_log_level("admin_api_server", "trace", expires=10) + time.sleep(1) + admin.set_log_level("admin_api_server", "error", expires=0) + + try: + mon.wait_until("Expiring log level for {admin_api_server}", + timeout_sec=15, + backoff_sec=1) + assert False, "Should not have seen message" + except ducktape.errors.TimeoutError: + pass + + level = admin.get_log_level("admin_api_server")[0]["level"] + assert level == "error", f"expected level=error, got {level=}" @cluster(num_nodes=3) def test_max_expiry(self):