From feeec639f2d52379d33100c505682113deb03817 Mon Sep 17 00:00:00 2001
From: Brandon Allard
Date: Tue, 2 Jul 2024 23:23:02 -0400
Subject: [PATCH 1/3] cluster: cache kafka offset of last applied record in
 log_eviction_stm

This allows the log_eviction_stm to know what the current kafka start
offset override is even if it doesn't reside in the local log. However,
since this cached value is not snapshotted by the stm, it may not be
recovered when Redpanda restarts. In that case one would need to fall
back on the archival stm to find the kafka start offset override.
---
 src/v/cluster/log_eviction_stm.cc | 36 ++++++++++++++++++++++++++++---
 src/v/cluster/log_eviction_stm.h  | 21 +++++++++++-------
 src/v/cluster/partition.cc        | 25 ++++++---------------
 3 files changed, 53 insertions(+), 29 deletions(-)

diff --git a/src/v/cluster/log_eviction_stm.cc b/src/v/cluster/log_eviction_stm.cc
index 7f08c59552a3..f871844d0cce 100644
--- a/src/v/cluster/log_eviction_stm.cc
+++ b/src/v/cluster/log_eviction_stm.cc
@@ -217,8 +217,37 @@ log_eviction_stm::do_write_raft_snapshot(model::offset truncation_point) {
       raft::write_snapshot_cfg(truncation_point, std::move(snapshot_data)));
 }
 
-ss::future<result<model::offset>>
-log_eviction_stm::sync_start_offset_override(
+kafka::offset log_eviction_stm::kafka_start_offset_override() {
+    if (_cached_kafka_start_offset_override != kafka::offset{}) {
+        return _cached_kafka_start_offset_override;
+    }
+
+    // Since the STM doesn't snapshot `_cached_kafka_start_offset_override`,
+    // it's possible for it to be lost during restarts. Therefore the raft
+    // offset, which is snapshotted, will be translated if possible.
+    if (_delete_records_eviction_offset == model::offset{}) {
+        return kafka::offset{};
+    }
+
+    auto raft_start_offset_override = model::next_offset(
+      _delete_records_eviction_offset);
+
+    // This handles an edge case where the stm will not record any raft
+    // offsets that do not land in local storage. Hence returning
+    // `kafka::offset{}` indicates to the caller that the archival stm
+    // should be queried for the offset instead.
+    if (raft_start_offset_override <= _raft->start_offset()) {
+        return kafka::offset{};
+    }
+
+    _cached_kafka_start_offset_override = model::offset_cast(
+      _raft->log()->from_log_offset(raft_start_offset_override));
+
+    return _cached_kafka_start_offset_override;
+}
+
+ss::future<result<kafka::offset>>
+log_eviction_stm::sync_kafka_start_offset_override(
   model::timeout_clock::duration timeout) {
     /// Call this method to ensure followers have processed up until the
     /// most recent known version of the special batch. This is particularly
@@ -232,7 +261,7 @@ log_eviction_stm::sync_start_offset_override(
             co_return errc::timeout;
         }
     }
-    co_return start_offset_override();
+    co_return kafka_start_offset_override();
 }
 
 model::offset log_eviction_stm::effective_start_offset() const {
@@ -369,6 +398,7 @@ ss::future<> log_eviction_stm::apply(const model::record_batch& batch) {
     }
     const auto record = serde::from_iobuf<prefix_truncate_record>(
       batch.copy_records().begin()->release_value());
+    _cached_kafka_start_offset_override = record.kafka_start_offset;
    if (record.rp_start_offset == model::offset{}) {
        // This may happen if the requested offset was not in the local log at
        // time of replicating. We still need to have replicated it though so

diff --git a/src/v/cluster/log_eviction_stm.h b/src/v/cluster/log_eviction_stm.h
index a992a0675200..0338bf240baf 100644
--- a/src/v/cluster/log_eviction_stm.h
+++ b/src/v/cluster/log_eviction_stm.h
@@ -94,15 +94,17 @@ class log_eviction_stm
     /// This only returns the start override, if one exists.
 It does not take
     /// into account local storage, and may not even point to an offset that
     /// exists in local storage (e.g. if we have locally truncated).
-    ss::future<result<model::offset>>
-    sync_start_offset_override(model::timeout_clock::duration timeout);
+    ///
+    /// If `kafka::offset{}` is returned and archival storage is enabled for
+    /// the given ntp, then the caller should fall back on the archival stm to
+    /// check if a start offset override exists, and if so, what its value is.
+    ss::future<result<kafka::offset>>
+    sync_kafka_start_offset_override(model::timeout_clock::duration timeout);
 
-    model::offset start_offset_override() const {
-        if (_delete_records_eviction_offset == model::offset{}) {
-            return model::offset{};
-        }
-        return model::next_offset(_delete_records_eviction_offset);
-    }
+    /// If `kafka::offset{}` is returned and archival storage is enabled for
+    /// the given ntp, then the caller should fall back on the archival stm to
+    /// check if a start offset override exists, and if so, what its value is.
+    kafka::offset kafka_start_offset_override();
 
     ss::future<iobuf> take_snapshot(model::offset) final { co_return iobuf{}; }
 
@@ -144,6 +146,9 @@ class log_eviction_stm
 
     // Should be signaled every time either of the above offsets are updated.
     ss::condition_variable _has_pending_truncation;
+
+    // Kafka offset of the last `prefix_truncate_record` applied to this stm.
+    kafka::offset _cached_kafka_start_offset_override;
 };
 
 class log_eviction_stm_factory : public state_machine_factory {

diff --git a/src/v/cluster/partition.cc b/src/v/cluster/partition.cc
index ec8d998b09e8..6ac51416b74d 100644
--- a/src/v/cluster/partition.cc
+++ b/src/v/cluster/partition.cc
@@ -1214,22 +1214,14 @@ ss::future<result<model::offset>>
 partition::sync_kafka_start_offset_override(
   model::timeout_clock::duration timeout) {
     if (_log_eviction_stm && !is_read_replica_mode_enabled()) {
-        auto offset_res
-          = co_await _log_eviction_stm->sync_start_offset_override(timeout);
+        auto offset_res = co_await _log_eviction_stm
+                            ->sync_kafka_start_offset_override(timeout);
         if (offset_res.has_failure()) {
             co_return offset_res.as_failure();
         }
-        // The eviction STM only keeps track of DeleteRecords truncations
-        // as Raft offsets. Translate if possible.
-        if (
-          offset_res.value() != model::offset{}
-          && _raft->start_offset() < offset_res.value()) {
-            auto start_kafka_offset = log()->from_log_offset(
-              offset_res.value());
-            co_return start_kafka_offset;
+        if (offset_res.value() != kafka::offset{}) {
+            co_return kafka::offset_cast(offset_res.value());
         }
-        // If a start override is no longer in the offset translator state,
-        // it may have been uploaded and persisted in the manifest.
     }
     if (_archival_meta_stm) {
         auto term = _raft->term();
@@ -1304,13 +1296,10 @@ partition::archival_meta_stm() const {
 
 std::optional<model::offset> partition::kafka_start_offset_override() const {
     if (_log_eviction_stm && !is_read_replica_mode_enabled()) {
-        auto o = _log_eviction_stm->start_offset_override();
-        if (o != model::offset{} && _raft->start_offset() < o) {
-            auto start_kafka_offset = log()->from_log_offset(o);
-            return start_kafka_offset;
+        auto o = _log_eviction_stm->kafka_start_offset_override();
+        if (o != kafka::offset{}) {
+            return kafka::offset_cast(o);
         }
-        // If a start override is no longer in the offset translator state,
-        // it may have been uploaded and persisted in the manifest.
     }
     if (_archival_meta_stm) {
         auto o
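The lookup order this patch introduces — cached kafka offset first, then
translation of the snapshotted raft offset, then a sentinel telling the
caller to consult the archival stm — can be summarized in a small
standalone sketch. All names and types below are illustrative stand-ins,
not Redpanda's actual interfaces:

// Standalone sketch of the caching scheme; hypothetical names throughout.
#include <cstdint>
#include <functional>

using offset_t = int64_t;
constexpr offset_t none = -1; // plays the role of kafka::offset{}

struct eviction_stm_sketch {
    offset_t cached_kafka_override{none};     // set on apply(), lost on restart
    offset_t snapshotted_raft_eviction{none}; // persisted in the stm snapshot
    offset_t raft_start_offset{0};            // first offset still in the local log
    std::function<offset_t(offset_t)> raft_to_kafka; // offset translation

    // Returns `none` to tell the caller "fall back on the archival stm".
    offset_t kafka_start_offset_override() {
        if (cached_kafka_override != none) {
            return cached_kafka_override; // fast path after an apply()
        }
        if (snapshotted_raft_eviction == none) {
            return none; // no override was ever recorded locally
        }
        offset_t raft_override = snapshotted_raft_eviction + 1;
        if (raft_override <= raft_start_offset) {
            return none; // already evicted locally; cannot translate
        }
        cached_kafka_override = raft_to_kafka(raft_override); // re-cache
        return cached_kafka_override;
    }
};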
From 3ba042d4ee19e1329c8faedd336d65d483c3b462 Mon Sep 17 00:00:00 2001
From: Brandon Allard
Date: Sat, 8 Jun 2024 23:23:08 -0400
Subject: [PATCH 2/3] cluster: avoid syncing archival_meta_stm in
 sync_kafka_start_offset_override
---
 src/v/cluster/partition.cc | 52 ++++++++++++++++++++++++++++++++------
 src/v/cluster/partition.h  |  4 +++
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/src/v/cluster/partition.cc b/src/v/cluster/partition.cc
index 6ac51416b74d..e6a105b94162 100644
--- a/src/v/cluster/partition.cc
+++ b/src/v/cluster/partition.cc
@@ -1213,7 +1213,22 @@ partition::get_cloud_storage_manifest_view() {
 ss::future<result<model::offset>>
 partition::sync_kafka_start_offset_override(
   model::timeout_clock::duration timeout) {
-    if (_log_eviction_stm && !is_read_replica_mode_enabled()) {
+    if (is_read_replica_mode_enabled()) {
+        auto term = _raft->term();
+        if (!co_await _archival_meta_stm->sync(timeout)) {
+            if (term != _raft->term()) {
+                co_return errc::not_leader;
+            } else {
+                co_return errc::timeout;
+            }
+        }
+        auto start_kafka_offset
+          = _archival_meta_stm->manifest().get_start_kafka_offset_override();
+
+        co_return kafka::offset_cast(start_kafka_offset);
+    }
+
+    if (_log_eviction_stm) {
         auto offset_res = co_await _log_eviction_stm
                             ->sync_kafka_start_offset_override(timeout);
         if (offset_res.has_failure()) {
@@ -1223,7 +1238,29 @@ partition::sync_kafka_start_offset_override(
             co_return kafka::offset_cast(offset_res.value());
         }
     }
-    if (_archival_meta_stm) {
+
+    if (!_archival_meta_stm) {
+        co_return model::offset{};
+    }
+
+    // There are a few cases in which the log_eviction_stm will return a kafka
+    // offset of `kafka::offset{}` for the start offset override:
+    // - The topic was remotely recovered.
+    // - A start offset override was never set.
+    // - The broker has restarted and the log_eviction_stm couldn't recover
+    //   the kafka offset for the start offset override.
+    //
+    // In all cases we'll need to fall back to the archival stm to figure out
+    // if a start offset override exists, and if so, what it is.
+    //
+    // For this we'll sync the archival stm a single time to ensure we have
+    // the most up-to-date manifest. From that point onwards the offset
+    // `_archival_meta_stm->manifest().get_start_kafka_offset_override()` will
+    // be correct without having to sync again. This is because the offset
+    // will not change until another offset override has been applied to the
+    // log eviction stm, and at that point the log eviction stm will be able
+    // to give us the correct offset override.
+    if (!_has_synced_archival_for_start_override) [[unlikely]] {
         auto term = _raft->term();
         if (!co_await _archival_meta_stm->sync(timeout)) {
             if (term != _raft->term()) {
@@ -1232,13 +1269,12 @@ partition::sync_kafka_start_offset_override(
                 co_return errc::timeout;
             }
         }
-        auto start_kafka_offset
-          = _archival_meta_stm->manifest().get_start_kafka_offset_override();
-        if (start_kafka_offset != kafka::offset{}) {
-            co_return kafka::offset_cast(start_kafka_offset);
-        }
+        _has_synced_archival_for_start_override = true;
     }
-    co_return model::offset{};
+
+    auto start_kafka_offset
+      = _archival_meta_stm->manifest().get_start_kafka_offset_override();
+    co_return kafka::offset_cast(start_kafka_offset);
 }
 
 model::offset partition::last_stable_offset() const {

diff --git a/src/v/cluster/partition.h b/src/v/cluster/partition.h
index 0b503a136bb9..c12f1fbe4b2b 100644
--- a/src/v/cluster/partition.h
+++ b/src/v/cluster/partition.h
@@ -377,6 +377,10 @@ class partition : public ss::enable_lw_shared_from_this<partition> {
 
     ss::sharded<archival::upload_housekeeping_service>& _upload_housekeeping;
 
+    // Used in `sync_kafka_start_offset_override` to avoid having to re-sync
+    // the `archival_meta_stm`.
+    bool _has_synced_archival_for_start_override{false};
+
     friend std::ostream& operator<<(std::ostream& o, const partition& x);
 };
 
 } // namespace cluster
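The optimization in this patch amounts to a guarded lazy refresh: pay for
one archival sync, then trust the in-memory manifest, because any newer
override would surface through the eviction stm first. A minimal sketch of
the pattern, with hypothetical names standing in for the real types:

// Sketch of "sync once, then read from memory"; names are illustrative.
#include <cstdint>

struct manifest_sketch {
    int64_t start_kafka_offset_override{-1};
};

struct partition_sketch {
    bool synced_archival_for_override{false};
    manifest_sketch manifest;

    // Stand-in for the expensive archival sync round trip.
    manifest_sketch fetch_latest_manifest() { return manifest_sketch{}; }

    int64_t start_offset_override() {
        if (!synced_archival_for_override) { // only the first caller pays
            manifest = fetch_latest_manifest();
            synced_archival_for_override = true;
        }
        // Safe to serve from memory afterwards: a newer override would be
        // observed via the eviction stm before reaching this fallback.
        return manifest.start_kafka_offset_override;
    }
};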
From e96451ad86d860097a7f1928ac3445f411459318 Mon Sep 17 00:00:00 2001
From: Brandon Allard
Date: Sun, 23 Jun 2024 16:22:27 -0400
Subject: [PATCH 3/3] treewide: remove coroutines in validate_fetch_offset
 callees

The most common path through validate_fetch_offset results in a number
of short-lived coroutines. In these cases the allocation/deallocation of
the coroutine's frame ends up dominating the function's runtime.

This commit removes the coroutines in favor of then-chains, which can
avoid the allocation if the task quota hasn't been exceeded.
---
 src/v/cluster/log_eviction_stm.cc          |  19 ++--
 src/v/kafka/server/replicated_partition.cc | 102 +++++++++++----------
 src/v/raft/persisted_stm.cc                |  22 +++--
 3 files changed, 77 insertions(+), 66 deletions(-)

diff --git a/src/v/cluster/log_eviction_stm.cc b/src/v/cluster/log_eviction_stm.cc
index f871844d0cce..59c5b1257d08 100644
--- a/src/v/cluster/log_eviction_stm.cc
+++ b/src/v/cluster/log_eviction_stm.cc
@@ -254,14 +254,17 @@ log_eviction_stm::sync_kafka_start_offset_override(
     /// useful to know if the start offset is up to date in the case
     /// leadership has recently changed, for example.
     auto term = _raft->term();
-    if (!co_await sync(timeout)) {
-        if (term != _raft->term()) {
-            co_return errc::not_leader;
-        } else {
-            co_return errc::timeout;
-        }
-    }
-    co_return kafka_start_offset_override();
+    return sync(timeout).then(
+      [this, term](bool success) -> result<kafka::offset> {
+          if (!success) {
+              if (term != _raft->term()) {
+                  return errc::not_leader;
+              } else {
+                  return errc::timeout;
+              }
+          }
+          return kafka_start_offset_override();
+      });
 }
 
 model::offset log_eviction_stm::effective_start_offset() const {

diff --git a/src/v/kafka/server/replicated_partition.cc b/src/v/kafka/server/replicated_partition.cc
index 77536cba6d30..0840eeb6cf6b 100644
--- a/src/v/kafka/server/replicated_partition.cc
+++ b/src/v/kafka/server/replicated_partition.cc
@@ -45,30 +45,32 @@ const model::ntp& replicated_partition::ntp() const {
 ss::future<result<model::offset, error_code>>
 replicated_partition::sync_effective_start(
   model::timeout_clock::duration timeout) {
-    auto synced_start_offset_override
-      = co_await _partition->sync_kafka_start_offset_override(timeout);
-    if (synced_start_offset_override.has_failure()) {
-        auto err = synced_start_offset_override.error();
-        auto error_code = error_code::unknown_server_error;
-        if (err.category() == cluster::error_category()) {
-            switch (cluster::errc(err.value())) {
-            /**
-             * In the case of timeout and shutting down errors return
-             * not_leader_for_partition error to force clients to retry.
-             */
-            case cluster::errc::shutting_down:
-            case cluster::errc::not_leader:
-            case cluster::errc::timeout:
-                error_code = error_code::not_leader_for_partition;
-                break;
-            default:
-                error_code = error_code::unknown_server_error;
-            }
-        }
-        co_return error_code;
-    }
-    co_return kafka_start_offset_with_override(
-      synced_start_offset_override.value());
+    return _partition->sync_kafka_start_offset_override(timeout).then(
+      [this](auto synced_start_offset_override)
+        -> result<model::offset, error_code> {
+          if (synced_start_offset_override.has_failure()) {
+              auto err = synced_start_offset_override.error();
+              auto error_code = error_code::unknown_server_error;
+              if (err.category() == cluster::error_category()) {
+                  switch (cluster::errc(err.value())) {
+                  /**
+                   * In the case of timeout and shutting down errors return
+                   * not_leader_for_partition error to force clients to retry.
+ */ + case cluster::errc::shutting_down: + case cluster::errc::not_leader: + case cluster::errc::timeout: + error_code = error_code::not_leader_for_partition; + break; + default: + error_code = error_code::unknown_server_error; + } + } + return error_code; + } + return kafka_start_offset_with_override( + synced_start_offset_override.value()); + }); } model::offset replicated_partition::start_offset() const { @@ -575,36 +577,40 @@ ss::future replicated_partition::validate_fetch_offset( ec); } - co_return ec; + return ss::make_ready_future(ec); } // Grab the up to date start offset auto timeout = deadline - model::timeout_clock::now(); - auto start_offset = co_await sync_effective_start(timeout); - if (!start_offset) { - vlog( - klog.warn, - "ntp {}: error obtaining latest start offset - {}", - ntp(), - start_offset.error()); - co_return start_offset.error(); - } + return sync_effective_start(timeout).then( + [this, fetch_offset](auto start_offset) { + if (!start_offset) { + vlog( + klog.warn, + "ntp {}: error obtaining latest start offset - {}", + ntp(), + start_offset.error()); + return start_offset.error(); + } - if ( - fetch_offset < start_offset.value() || fetch_offset > log_end_offset()) { - vlog( - klog.warn, - "ntp {}: fetch offset_out_of_range on leader, requested: {}, " - "partition start offset: {}, high watermark: {}, log end offset: {}", - ntp(), - fetch_offset, - start_offset.value(), - high_watermark(), - log_end_offset()); - co_return error_code::offset_out_of_range; - } + if ( + fetch_offset < start_offset.value() + || fetch_offset > log_end_offset()) { + vlog( + klog.warn, + "ntp {}: fetch offset_out_of_range on leader, requested: {}, " + "partition start offset: {}, high watermark: {}, log end " + "offset: {}", + ntp(), + fetch_offset, + start_offset.value(), + high_watermark(), + log_end_offset()); + return error_code::offset_out_of_range; + } - co_return error_code::none; + return error_code::none; + }); } result replicated_partition::get_partition_info() const { diff --git a/src/v/raft/persisted_stm.cc b/src/v/raft/persisted_stm.cc index f7a62f9ceeea..d15ad1428692 100644 --- a/src/v/raft/persisted_stm.cc +++ b/src/v/raft/persisted_stm.cc @@ -492,16 +492,16 @@ ss::future persisted_stm::sync(model::timeout_clock::duration timeout) { auto term = _raft->term(); if (!_raft->is_leader()) { - co_return false; + return ss::make_ready_future(false); } if (_insync_term == term) { - co_return true; + return ss::make_ready_future(true); } if (_is_catching_up) { auto deadline = model::timeout_clock::now() + timeout; auto sync_waiter = ss::make_lw_shared>(); _sync_waiters.push_back(sync_waiter); - co_return co_await sync_waiter->get_future_with_timeout( + return sync_waiter->get_future_with_timeout( deadline, [] { return false; }); } _is_catching_up = true; @@ -527,14 +527,16 @@ persisted_stm::sync(model::timeout_clock::duration timeout) { // of the term yet. sync_offset = log_offsets.dirty_offset; } - auto is_synced = co_await do_sync(timeout, sync_offset, term); - _is_catching_up = false; - for (auto& sync_waiter : _sync_waiters) { - sync_waiter->set_value(is_synced); - } - _sync_waiters.clear(); - co_return is_synced; + return do_sync(timeout, sync_offset, term).then([this](bool is_synced) { + _is_catching_up = false; + for (auto& sync_waiter : _sync_waiters) { + sync_waiter->set_value(is_synced); + } + _sync_waiters.clear(); + + return is_synced; + }); } template