Merge pull request #13063 from vbotbuildovich/backport-pr-13013-v23.2.x-309

[v23.2.x] cloud_storage: Force exhaustive trim when fast trim fails
redpanda-data · Aug 30, 2023 · 7422f86 · 7422f86
2 parents fa13038 + fb97c0c
commit 7422f86
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 27 deletions.
diff --git a/src/v/cloud_storage/cache_service.cc b/src/v/cloud_storage/cache_service.cc
@@ -351,7 +351,8 @@ ss::future<> cache::trim(
     vlog(
       cst_log.debug,
       "trim: set target_size {}/{}, size {}/{}, walked size {} (max {}/{}), "
-      " reserved {}/{}, pending {}/{})",
+      " reserved {}/{}, pending {}/{}), candidates for deletion: {}, filtered "
+      "out: {}",
       target_size,
       target_objects,
       _current_cache_size,
@@ -362,7 +363,9 @@ ss::future<> cache::trim(
       _reserved_cache_size,
       _reserved_cache_objects,
       _reservations_pending,
-      _reservations_pending_objects);
+      _reservations_pending_objects,
+      candidates_for_deletion.size(),
+      filtered_out_files);
 
     if (
       _current_cache_size + _reserved_cache_size < target_size
@@ -453,9 +456,24 @@ ss::future<> cache::trim(
     // cache.
     size_to_delete = std::min(
       walked_cache_size - fast_result.deleted_size, size_to_delete);
-    objects_to_delete = std::min(
-      candidates_for_deletion.size() - fast_result.deleted_count,
-      objects_to_delete);
+
+    // If we were not able to delete enough files and there are some filtered
+    // out files, force an exhaustive trim. This ensures that if the cache is
+    // dominated by filtered out files, we do not skip trimming them by reducing
+    // the objects_to_delete counter next.
+    bool force_exhaustive_trim = fast_result.deleted_count < objects_to_delete
+                                 && filtered_out_files > 0;
+
+    // When every file in the cache is filtered out,
+    // candidates_for_deletion.size() is 1 (only the access time tracker file),
+    // so the reduction of objects_to_delete below would clamp the counter to
+    // at most 1 and the exhaustive trim would be skipped. The
+    // force_exhaustive_trim check avoids this.
+    if (!force_exhaustive_trim) {
+        objects_to_delete = std::min(
+          candidates_for_deletion.size() - fast_result.deleted_count,
+          objects_to_delete);
+    }
 
     if (
       size_to_delete > undeletable_bytes

diff --git a/src/v/cloud_storage/tests/cache_test.cc b/src/v/cloud_storage/tests/cache_test.cc
@@ -26,6 +26,7 @@
 #include <boost/test/unit_test.hpp>
 
 #include <chrono>
+#include <fstream>
 #include <stdexcept>
 
 using namespace cloud_storage;
@@ -466,3 +467,36 @@ FIXTURE_TEST(test_clean_up_on_start_empty, cache_test_fixture) {
 
     BOOST_CHECK(ss::file_exists(CACHE_DIR.native()).get());
 }
+
+/**
+ * Regression test: the cache directory is populated only with files that fast
+ * trim filters out (empty `.index` files). A fast trim that cannot reach the
+ * requested object limit must then be followed by an exhaustive trim which
+ * removes those files.
+ */
+FIXTURE_TEST(test_exhaustive_trim_runs_after_fast_trim, cache_test_fixture) {
+    std::vector<std::filesystem::path> indices;
+    const auto count_indices = 5;
+    indices.reserve(count_indices);
+
+    for (auto i = 0; i < count_indices; ++i) {
+        // The format string has a single placeholder, so pass a single
+        // argument.
+        indices.emplace_back(CACHE_DIR / fmt::format("{}.index", i));
+        // Opening the stream creates an empty file at that path.
+        std::ofstream f{indices.back()};
+        f.flush();
+    }
+
+    // Sanity check: every index file must be on disk before trimming.
+    BOOST_REQUIRE(
+      std::all_of(indices.cbegin(), indices.cend(), [](const auto& path) {
+          return std::filesystem::exists(path);
+      }));
+
+    // Make cache service scan the disk for objects
+    clean_up_at_start().get();
+
+    // Only allow the access time tracker to remain on disk.
+    trim_cache(std::nullopt, 1);
+
+    // With an object limit of 1, none of the index files may survive.
+    BOOST_REQUIRE(
+      std::all_of(indices.cbegin(), indices.cend(), [](const auto& path) {
+          return !std::filesystem::exists(path);
+      }));
+}
diff --git a/src/v/cloud_storage/tests/cache_test_fixture.h b/src/v/cloud_storage/tests/cache_test_fixture.h
@@ -107,10 +107,16 @@ class cache_test_fixture {
         return sharded_cache.local().clean_up_at_start();
     }
 
-    void trim_cache() {
+    /**
+     * Runs cache::trim on shard 0 and waits for it to finish.
+     *
+     * Both overrides are forwarded unchanged to cache::trim; leaving them at
+     * std::nullopt keeps the call equivalent to the previous zero-argument
+     * form for existing callers.
+     */
+    void trim_cache(
+      std::optional<uint64_t> size_limit_override = std::nullopt,
+      std::optional<size_t> object_limit_override = std::nullopt) {
         sharded_cache
           .invoke_on(
-            ss::shard_id{0}, [](cloud_storage::cache& c) { return c.trim(); })
+            ss::shard_id{0},
+            // Copy the overrides into the closure so that it never refers
+            // back to this stack frame.
+            [size_limit_override,
+             object_limit_override](cloud_storage::cache& c) {
+                return c.trim(size_limit_override, object_limit_override);
+            })
           .get();
     }
 };

diff --git a/tests/rptest/scale_tests/tiered_storage_cache_stress_test.py b/tests/rptest/scale_tests/tiered_storage_cache_stress_test.py
@@ -359,18 +359,7 @@ def tiny_cache_test(self):
                                         cache_size,
                                         max_objects=None)
 
-    @cluster(num_nodes=4)
-    def garbage_objects_test(self):
-        """
-        Verify that if there are a large number of small files which do not pair
-        with data chunks, we still trim them when cache space is low.
-
-        This test is a reproducer for issues where the cache needs trimming but there
-        are not data objects present to cue the fast trim process to delete indices etc,
-        and we must fall back to exhaustive trim, such as:
-        https://github.com/redpanda-data/redpanda/issues/11835
-        """
-
+    def run_test_with_cache_prefilled(self, cache_prefill_command: str):
         segment_size = 128 * 1024 * 1024
         msg_size = 16384
         data_size = segment_size * 10
@@ -382,15 +371,12 @@ def garbage_objects_test(self):
         for n in self.redpanda.nodes:
             self.redpanda.clean_node(n)
 
-        # Pre-populate caches with garbage files.
-        garbage_count = 100
+        # Pre-populate caches with files.
+        prefill_count = 100
         for node in self.redpanda.nodes:
-            node.account.ssh_output(
-                f"mkdir -p {self.redpanda.cache_dir} ; for n in `seq 1 {garbage_count}`; do "
-                f"dd if=/dev/urandom bs=1k count=4 of={self.redpanda.cache_dir}/garbage_$n.bin ; done",
-                combine_stderr=False)
+            node.account.ssh(cache_prefill_command.format(prefill_count))
 
-        cache_object_limit = garbage_count // 2
+        cache_object_limit = prefill_count // 2
 
         # Set cache size to 50 objects
         si_settings = SISettings(
@@ -410,7 +396,7 @@ def garbage_objects_test(self):
         for node in self.redpanda.nodes:
             usage = admin.get_local_storage_usage(node)
             assert usage[
-                'cloud_storage_cache_objects'] >= garbage_count, f"Node {node.name} has unexpectedly few objects {usage['cloud_storage_cache_objects']} < {garbage_count}"
+                'cloud_storage_cache_objects'] >= prefill_count, f"Node {node.name} has unexpectedly few objects {usage['cloud_storage_cache_objects']} < {prefill_count}"
 
         # Inject data
         self._create_topic(topic_name, 1, segment_size)
@@ -437,3 +423,34 @@ def garbage_objects_test(self):
         usage = admin.get_local_storage_usage(leader_node)
         assert usage[
             'cloud_storage_cache_objects'] <= cache_object_limit, f"Node {leader_node.name} has unexpectedly many objects {usage['cloud_storage_cache_objects']} > {cache_object_limit}"
+
+    @cluster(num_nodes=4)
+    def garbage_objects_test(self):
+        """
+        Verify that if there are a large number of small files which do not pair
+        with data chunks, we still trim them when cache space is low.
+
+        This test is a reproducer for issues where the cache needs trimming but there
+        are no data objects present to cue the fast trim process to delete indices etc,
+        and we must fall back to exhaustive trim, such as:
+        https://github.com/redpanda-data/redpanda/issues/11835
+        """
+
+        self.run_test_with_cache_prefilled(
+            f"mkdir -p {self.redpanda.cache_dir} ; "
+            "for n in `seq 1 {}`; do "
+            f"dd if=/dev/urandom bs=1k count=4 of={self.redpanda.cache_dir}/garbage_$n.bin ; done"
+        )
+
+    @cluster(num_nodes=4)
+    def test_indices_dominate_cache(self):
+        """
+        Ensures that if the cache is filled with index and tx objects alone,
+        trimming still works.
+        """
+        self.run_test_with_cache_prefilled(
+            f"mkdir -pv {self.redpanda.cache_dir}; "
+            "for n in `seq 1 {}`; do "
+            f"touch {self.redpanda.cache_dir}/garbage_$n.index && "
+            f"touch {self.redpanda.cache_dir}/garbage_$n.tx; "
+            "done")