From 769921e34ba8aae84e3d6fee65954f790a70a25b Mon Sep 17 00:00:00 2001 From: Michael Boquard Date: Tue, 23 Jul 2024 16:04:58 -0400 Subject: [PATCH 1/2] k/client: Retry connection on DNS error During a rolling upgrade in cloud, it was observed that RP's kafka client would attempt to connect to the 'reserve' node after it was decommissioned. This was because the error code (C-Ares ENOTFOUND) was not treated as a retriable error. This change checks for the above error code when attempting to connect to a broker and if it is encountered, treats it as a retriable error. Signed-off-by: Michael Boquard (cherry picked from commit d79ab902239e382646bf8a81ccb14af56b71d524) --- src/v/kafka/client/broker.cc | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/v/kafka/client/broker.cc b/src/v/kafka/client/broker.cc index 9f8542d0b25de..d896adfa195aa 100644 --- a/src/v/kafka/client/broker.cc +++ b/src/v/kafka/client/broker.cc @@ -17,6 +17,27 @@ #include "rpc/rpc_utils.h" #include +#include + +#include + +namespace { +bool is_dns_failure_error(const std::system_error& e) { + if (e.code().category() == ss::net::dns::error_category()) { + switch (e.code().value()) { + case ARES_ENOTFOUND: + case ARES_ENODATA: + case ARES_ETIMEOUT: + case ARES_ECONNREFUSED: + return true; + default: + return false; + } + } + + return false; +} +} // namespace namespace kafka::client { @@ -50,7 +71,7 @@ ss::future make_broker( }); }) .handle_exception_type([node_id](const std::system_error& ex) { - if (net::is_reconnect_error(ex)) { + if (net::is_reconnect_error(ex) || is_dns_failure_error(ex)) { return ss::make_exception_future( broker_error(node_id, error_code::network_exception)); } From f8fbf36a6b175f1056a4796497908f6c7fcfe179 Mon Sep 17 00:00:00 2001 From: Michael Boquard Date: Thu, 25 Jul 2024 08:20:29 -0400 Subject: [PATCH 2/2] net: Check error category in is_reconnect_error Signed-off-by: Michael Boquard (cherry picked from commit 
da09267f75100b3f00d44df8fd9f130818982886) --- src/v/net/connection.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/v/net/connection.cc b/src/v/net/connection.cc index 8879be1759944..92967f351baf7 100644 --- a/src/v/net/connection.cc +++ b/src/v/net/connection.cc @@ -17,6 +17,8 @@ #include #include +#include + namespace net { /** @@ -40,7 +42,9 @@ bool is_reconnect_error(const std::system_error& e) { if (e.code().category() == ss::tls::error_category()) { return absl::c_any_of( ss_tls_reconnect_errors, [v](int ec) { return v == ec; }); - } else { + } else if ( + e.code().category() == std::system_category() + || e.code().category() == std::generic_category()) { switch (v) { case ECONNREFUSED: case ENETUNREACH: @@ -58,6 +62,9 @@ bool is_reconnect_error(const std::system_error& e) { default: return false; } + } else { + // We don't know what the error category is at this point + return false; } __builtin_unreachable(); }