From 6576e32d0cb952cec975cdd9825d44ecbc2f1579 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 17 Oct 2023 15:57:13 +0100 Subject: [PATCH] archival: don't stop the upload loop on sync fails Previously, we'd bail out of the upload loop if the initial sync call failed. The upload loop wouldn't restart until forced by re-setting the remote write topic config, restarting the node, or changing leadership. This leads to the disk filling up: without uploads, the collectable offset does not advance. The fix is to replace the `co_return` with `continue` so that the sync is retried. I've also added error logging for when either loop stops without an abort being requested. --- src/v/archival/ntp_archiver_service.cc | 28 ++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/v/archival/ntp_archiver_service.cc b/src/v/archival/ntp_archiver_service.cc index faf606e771811..9aab5a74d87e4 100644 --- a/src/v/archival/ntp_archiver_service.cc +++ b/src/v/archival/ntp_archiver_service.cc @@ -271,10 +271,30 @@ const cloud_storage::partition_manifest& ntp_archiver::manifest() const { ss::future<> ntp_archiver::start() { if (_parent.get_ntp_config().is_read_replica_mode_enabled()) { - ssx::spawn_with_gate( - _gate, [this] { return sync_manifest_until_abort(); }); + ssx::spawn_with_gate(_gate, [this] { + return sync_manifest_until_abort().then([this] { + if (!_as.abort_requested()) { + vlog( + _rtclog.error, + "Sync loop stopped without an abort being requested. " + "Please disable and re-enable " + "redpanda.remote.readreplica " + "the topic in order to restart it."); + } + }); + }); } else { - ssx::spawn_with_gate(_gate, [this] { return upload_until_abort(); }); + ssx::spawn_with_gate(_gate, [this] { + return upload_until_abort().then([this]() { + if (!_as.abort_requested()) { + vlog( + _rtclog.error, + "Upload loop stopped without an abort being requested. 
" + "Please disable and re-enable redpanda.remote.write " + "the topic in order to restart it."); + } + }); + }); } return ss::now(); @@ -341,7 +361,7 @@ ss::future<> ntp_archiver::upload_until_abort() { bool is_synced = co_await _parent.archival_meta_stm()->sync( sync_timeout); if (!is_synced) { - co_return; + continue; } vlog(_rtclog.debug, "upload loop synced in term {}", _start_term);