From f10017df143d6a30d24220e5e698467ea06ee87d Mon Sep 17 00:00:00 2001 From: Michal Maslanka Date: Mon, 20 Nov 2023 16:42:47 +0100 Subject: [PATCH] r/recovery_stm: stop recovery when follower was already updated Recovery and replicate stms are not synchronized. It may be the case when both of stms are active at the same time that the same batch is delivered to the follower twice. In general this batch duplication is harmless as Raft is not vulnerable for messages redelivery but it may cause unnecessary truncation and latency increase. Added a check validating expected log end offset right before sending recovery append entries request. This will prevent sending the same set of batches twice to the follower. Fixes: #14413 Signed-off-by: Michal Maslanka --- src/v/raft/recovery_stm.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/v/raft/recovery_stm.cc b/src/v/raft/recovery_stm.cc index c1beda8ae2ce1..8f27c802bc2c3 100644 --- a/src/v/raft/recovery_stm.cc +++ b/src/v/raft/recovery_stm.cc @@ -557,6 +557,18 @@ ss::future<> recovery_stm::replicate( _stop_requested = true; return ss::now(); } + if (meta.value()->expected_log_end_offset >= _last_batch_offset) { + vlog( + _ctxlog.trace, + "follower expected log end offset is already updated, stopping " + "recovery. Expected log end offset: {}, recovery range last offset: " + "{}", + meta.value()->expected_log_end_offset, + _last_batch_offset); + + _stop_requested = true; + return ss::now(); + } /** * Update follower expected log end. It is equal to the last batch in a set * of batches read for this recovery round.