From f3888315ade1a54d7ac9e49137796d6c8658746e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gell=C3=A9rt=20Peresztegi-Nagy?= Date: Thu, 7 Nov 2024 13:12:48 +0000 Subject: [PATCH 1/2] cloud_storage_client: accept templatized logger This is to allow passing in a `retry_chain_logger` which does not inherit from `ss::logger` but wraps it. --- src/v/cloud_storage_clients/BUILD | 1 + src/v/cloud_storage_clients/util.cc | 9 ++++++++- src/v/cloud_storage_clients/util.h | 3 ++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/v/cloud_storage_clients/BUILD b/src/v/cloud_storage_clients/BUILD index e825bc24ec3f6..7d8f321d24b70 100644 --- a/src/v/cloud_storage_clients/BUILD +++ b/src/v/cloud_storage_clients/BUILD @@ -53,6 +53,7 @@ redpanda_cc_library( "//src/v/utils:functional", "//src/v/utils:log_hist", "//src/v/utils:named_type", + "//src/v/utils:retry_chain_node", "//src/v/utils:stop_signal", "@boost//:beast", "@boost//:lexical_cast", diff --git a/src/v/cloud_storage_clients/util.cc b/src/v/cloud_storage_clients/util.cc index 5bd1eec7faef8..f3e6ccef3fe07 100644 --- a/src/v/cloud_storage_clients/util.cc +++ b/src/v/cloud_storage_clients/util.cc @@ -13,6 +13,7 @@ #include "base/vlog.h" #include "bytes/streambuf.h" #include "net/connection.h" +#include "utils/retry_chain_node.h" #include @@ -39,8 +40,9 @@ bool has_abort_or_gate_close_exception(const ss::nested_exception& ex) { || is_abort_or_gate_close_exception(ex.outer); } +template error_outcome handle_client_transport_error( - std::exception_ptr current_exception, ss::logger& logger) { + std::exception_ptr current_exception, Logger& logger) { auto outcome = error_outcome::retry; try { @@ -113,6 +115,11 @@ error_outcome handle_client_transport_error( return outcome; } +template error_outcome +handle_client_transport_error(std::exception_ptr, ss::logger&); +template error_outcome handle_client_transport_error( + std::exception_ptr, retry_chain_logger&); + ss::future drain_response_stream(http::client::response_stream_ref resp) { const auto transfer_encoding = resp->get_headers().find( diff --git a/src/v/cloud_storage_clients/util.h b/src/v/cloud_storage_clients/util.h index 312a3f015b170..ddc1e7c8c9199 100644 --- a/src/v/cloud_storage_clients/util.h +++ b/src/v/cloud_storage_clients/util.h @@ -24,8 +24,9 @@ namespace cloud_storage_clients::util { /// cloud provider (e.g. connection error). /// \param current_exception is the current exception thrown by the client /// \param logger is the logger to use +template error_outcome handle_client_transport_error( - std::exception_ptr current_exception, ss::logger& logger); + std::exception_ptr current_exception, Logger& logger); /// \brief: Drain the reponse stream pointed to by the 'resp' handle into an /// iobuf From ad14537c81bda0a8ae20f486b910920a4a8b0ba8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gell=C3=A9rt=20Peresztegi-Nagy?= Date: Thu, 7 Nov 2024 13:15:50 +0000 Subject: [PATCH 2/2] cloud_io: add missing error handling to `remote::download_object` The call to `drain_response_stream` may throw various transport related errors (see one example below of a Broken Pipe error observed in CI). These errors should be handled inside the `remote::download_object` method because the caller's expectation is that download-related errors are communicated via the `download_result` return type rather than through an exception. Some of these errors (like the broken pipe error below) could also be retried, whereas with the previous implementation they were not retried. These exceptions are often ignored by the caller and may be printed as "Exceptional future ignored" log lines, which cause CI failures and are less useful for debugging. The below is an example of one such ignored exceptional future in the remote partition finalizing background fibre: ``` INFO 2024-10-29 12:41:17,708 [shard 1:main] cloud_storage - [fiber474 kafka/fuzzy-operator-6356-dzxvff/4] - remote_partition.cc:1406 - Finalizing remote storage state... DEBUG 2024-10-29 12:41:17,723 [shard 1:main] cloud_io - [fiber819~0|1|19984ms] - remote.cc:430 - Receive OK response from "37836c6f-30b0-482f-bb4e-0f3dffdb5cbe/meta/kafka/fuzzy-operator-6356-dzxvff/1_3447/manifest.bin" WARN 2024-10-29 12:41:17,723 [shard 1:main] http - /37836c6f-30b0-482f-bb4e-0f3dffdb5cbe/meta/kafka/fuzzy-operator-6356-dzxvff/1_3447/manifest.bin - client.cc:414 - receive error std::__1::system_error (error generic:32, System error during SSL read: [error:FFFFFFFF80000020:system library::Broken pipe]: Broken pipe) WARN 2024-10-29 12:41:17,723 [shard 1:main] seastar - Exceptional future ignored: std::__1::system_error (error generic:32, System error during SSL read: [error:FFFFFFFF80000020:system library::Broken pipe]: Broken pipe), backtrace: 0xa73be23 0xa392e05 0x360a6b8 0x9352157 0x360a71a 0xa48cc6f 0xa49045c 0xa4e77ca 0xa402f3f /opt/redpanda/lib/libc.so.6+0x961b6 /opt/redpanda/lib/libc.so.6+0x11839b ``` --- src/v/cloud_io/remote.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/v/cloud_io/remote.cc b/src/v/cloud_io/remote.cc index ea086debefb6e..571ac6ea5de24 100644 --- a/src/v/cloud_io/remote.cc +++ b/src/v/cloud_io/remote.cc @@ -428,12 +428,18 @@ remote::download_object(download_request download_request) { if (resp) { vlog(ctxlog.debug, "Receive OK response from {}", path); - auto buffer - = co_await cloud_storage_clients::util::drain_response_stream( - resp.value()); - download_request.payload.append_fragments(std::move(buffer)); - transfer_details.on_success(); - co_return download_result::success; + try { + auto buffer + = co_await cloud_storage_clients::util::drain_response_stream( + resp.value()); + download_request.payload.append_fragments(std::move(buffer)); + transfer_details.on_success(); + co_return download_result::success; + } catch (...) { + resp + = cloud_storage_clients::util::handle_client_transport_error( + std::current_exception(), ctxlog); + } } lease.client->shutdown();