cluster: reject writes only if data disk is degraded
The health monitor now tracks only the data disk, since the only use of
that state is rejecting writes. Cache disk state is irrelevant at the
cluster level.

This was tested manually by creating a cluster with a custom cache disk
mountpoint and trying to produce to it.

Before this commit, producing would have failed with a full cache disk.
After this commit, producing fails only if the data disk is full.
nvartolomei committed Dec 6, 2024
1 parent 105b76e commit b6b9a60
Showing 6 changed files with 24 additions and 22 deletions.
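
The behavioral change can be illustrated with a minimal, self-contained
C++ sketch. All types and names below are simplified stand-ins, not the
actual Redpanda classes: cluster-level health is the maximum severity
across the nodes' data disk alerts only, and writes are rejected only when
that value reaches degraded, so a full cache disk no longer blocks produce
requests.

// Minimal sketch of the new gating logic; all names here are
// illustrative stand-ins, not the real Redpanda types.
#include <algorithm>
#include <cassert>
#include <vector>

enum class disk_space_alert { ok = 0, low_space = 1, degraded = 2 };

struct node_state {
    disk_space_alert data_disk_alert;
    disk_space_alert cache_disk_alert; // no longer consulted for writes
};

// Cluster health is the worst (max severity) of the nodes' *data* disks.
disk_space_alert
cluster_data_disk_health(const std::vector<node_state>& nodes) {
    auto health = disk_space_alert::ok;
    for (const auto& n : nodes) {
        health = std::max(n.data_disk_alert, health);
    }
    return health;
}

bool should_reject_writes(disk_space_alert health) {
    return health == disk_space_alert::degraded;
}

int main() {
    // A degraded cache disk alone no longer causes write rejection...
    std::vector<node_state> nodes{
      {disk_space_alert::ok, disk_space_alert::degraded}};
    assert(!should_reject_writes(cluster_data_disk_health(nodes)));

    // ...but a degraded data disk on any node still does.
    nodes.push_back({disk_space_alert::degraded, disk_space_alert::ok});
    assert(should_reject_writes(cluster_data_disk_health(nodes)));
}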
14 changes: 7 additions & 7 deletions src/v/cluster/health_monitor_backend.cc
@@ -295,7 +295,7 @@ health_monitor_backend::get_cluster_health(
 }
 
 ss::future<storage::disk_space_alert>
-health_monitor_backend::get_cluster_disk_health(
+health_monitor_backend::get_cluster_data_disk_health(
   force_refresh refresh, model::timeout_clock::time_point deadline) {
     auto ec = co_await maybe_refresh_cluster_health(refresh, deadline);
     if (ec) {
@@ -308,7 +308,7 @@ health_monitor_backend::get_cluster_disk_health(
         // operate, I guess.
         co_return storage::disk_space_alert::ok;
     }
-    co_return _reports_disk_health;
+    co_return _reports_data_disk_health;
 }
 
 ss::future<std::error_code>
@@ -444,8 +444,8 @@ ss::future<std::error_code> health_monitor_backend::collect_cluster_health() {
 
     auto old_reports = std::exchange(_reports, {});
 
-    // update nodes reports and cache cluster-level disk health
-    storage::disk_space_alert cluster_disk_health
+    // update nodes reports and cache cluster-level data disk health
+    storage::disk_space_alert cluster_data_disk_health
       = storage::disk_space_alert::ok;
     for (auto& r : reports) {
         if (r) {
@@ -471,14 +471,14 @@ ss::future<std::error_code> health_monitor_backend::collect_cluster_health() {
             for (auto& cb : _node_callbacks) {
                 cb.second(r.value(), old_report);
             }
-            cluster_disk_health = storage::max_severity(
-              r.value().local_state.get_disk_alert(), cluster_disk_health);
+            cluster_data_disk_health = storage::max_severity(
+              r.value().local_state.data_disk.alert, cluster_data_disk_health);
 
             _reports.emplace(
               id, ss::make_lw_shared<node_health_report>(std::move(r.value())));
         }
     }
-    _reports_disk_health = cluster_disk_health;
+    _reports_data_disk_health = cluster_data_disk_health;
 
     if (config::shard_local_cfg().enable_usage()) {
         vlog(clusterlog.info, "collecting cloud health statistics");
4 changes: 2 additions & 2 deletions src/v/cluster/health_monitor_backend.h
@@ -65,7 +65,7 @@ class health_monitor_backend {
     ss::future<result<cluster_health_report>> get_cluster_health(
       cluster_report_filter, force_refresh, model::timeout_clock::time_point);
 
-    ss::future<storage::disk_space_alert> get_cluster_disk_health(
+    ss::future<storage::disk_space_alert> get_cluster_data_disk_health(
       force_refresh refresh, model::timeout_clock::time_point deadline);
 
     ss::future<result<node_health_report>> collect_current_node_health();
@@ -182,7 +182,7 @@ class health_monitor_backend {
 
     status_cache_t _status;
     report_cache_t _reports;
-    storage::disk_space_alert _reports_disk_health
+    storage::disk_space_alert _reports_data_disk_health
       = storage::disk_space_alert::ok;
     std::optional<size_t> _bytes_in_cloud_storage;
 
20 changes: 11 additions & 9 deletions src/v/cluster/health_monitor_frontend.cc
@@ -60,8 +60,9 @@ health_monitor_frontend::get_cluster_health(
     });
 }
 
-storage::disk_space_alert health_monitor_frontend::get_cluster_disk_health() {
-    return _cluster_disk_health;
+storage::disk_space_alert
+health_monitor_frontend::get_cluster_data_disk_health() {
+    return _cluster_data_disk_health;
 }
 
 /**
@@ -100,23 +101,24 @@ health_monitor_frontend::get_cluster_health_overview(
 
 ss::future<> health_monitor_frontend::update_other_shards(
   const storage::disk_space_alert dsa) {
-    co_await container().invoke_on_others(
-      [dsa](health_monitor_frontend& fe) { fe._cluster_disk_health = dsa; });
+    co_await container().invoke_on_others([dsa](health_monitor_frontend& fe) {
+        fe._cluster_data_disk_health = dsa;
+    });
 }
 
 ss::future<> health_monitor_frontend::update_frontend_and_backend_cache() {
     auto deadline = model::time_from_now(default_timeout);
     auto disk_health = co_await dispatch_to_backend(
       [deadline](health_monitor_backend& be) {
-          return be.get_cluster_disk_health(force_refresh::no, deadline);
+          return be.get_cluster_data_disk_health(force_refresh::no, deadline);
       });
-    if (disk_health != _cluster_disk_health) {
+    if (disk_health != _cluster_data_disk_health) {
         vlog(
           clusterlog.debug,
-          "Update disk health cache {} -> {}",
-          _cluster_disk_health,
+          "Update data disk health cache {} -> {}",
+          _cluster_data_disk_health,
           disk_health);
-        _cluster_disk_health = disk_health;
+        _cluster_data_disk_health = disk_health;
         co_await update_other_shards(disk_health);
     }
 }
4 changes: 2 additions & 2 deletions src/v/cluster/health_monitor_frontend.h
@@ -57,7 +57,7 @@ class health_monitor_frontend
     ss::future<result<cluster_health_report>> get_cluster_health(
       cluster_report_filter, force_refresh, model::timeout_clock::time_point);
 
-    storage::disk_space_alert get_cluster_disk_health();
+    storage::disk_space_alert get_cluster_data_disk_health();
 
     // Collects or return cached version of current node health report.
     ss::future<result<node_health_report_ptr>> get_current_node_health();
@@ -101,7 +101,7 @@ class health_monitor_frontend
     config::binding<std::chrono::milliseconds> _alive_timeout;
 
     // Currently the worst / max of all nodes' disk space state
-    storage::disk_space_alert _cluster_disk_health{
+    storage::disk_space_alert _cluster_data_disk_health{
       storage::disk_space_alert::ok};
     ss::timer<ss::lowres_clock> _refresh_timer;
     ss::gate _refresh_gate;
2 changes: 1 addition & 1 deletion src/v/cluster/metadata_cache.cc
@@ -149,7 +149,7 @@ std::vector<model::node_id> metadata_cache::node_ids() const {
 }
 
 bool metadata_cache::should_reject_writes() const {
-    return _health_monitor.local().get_cluster_disk_health()
+    return _health_monitor.local().get_cluster_data_disk_health()
            == storage::disk_space_alert::degraded;
 }
 
2 changes: 1 addition & 1 deletion tests/rptest/tests/full_disk_test.py
@@ -147,7 +147,7 @@ def check_health_monitor_frontend(disk_space_change: str):
         # Looking for a log statement about a change in disk space.
         # This is a check for the health monitor frontend because
         # that structure logs disk space alerts.
-        pattern = f"Update disk health cache {disk_space_change}"
+        pattern = f"Update data disk health cache {disk_space_change}"
         wait_until(
             lambda: self.redpanda.search_log_any(pattern),
             timeout_sec=5,
