host-sp-comms: do not reboot host on boot failure #1618

Status: Open · wants to merge 1 commit into master
34 changes: 33 additions & 1 deletion task/host-sp-comms/src/main.rs
@@ -258,6 +258,13 @@ struct ServerImpl {
reboot_state: Option<RebootState>,
host_kv_storage: HostKeyValueStorage,
hf_mux_state: Option<HfMuxState>,

/// Set when we receive a Boot Failed message from the host. This flag
/// changes the behavior in response to the next Request Reboot message. It
/// is cleared at Request Reboot, *or* at any of the messages sent during
/// host boot, to try to keep our state from getting out of sync.
host_boot_just_failed: bool,
}

impl ServerImpl {
@@ -290,6 +297,8 @@ impl ServerImpl {
reboot_state: None,
host_kv_storage: HostKeyValueStorage::claim_static_resources(),
hf_mux_state: None,

host_boot_just_failed: false,
}
}

@@ -701,13 +710,24 @@ impl ServerImpl {

// We defer any actions until after we've serialized our response to
// avoid borrow checker issues with calling methods on `self`.
//
// NOTE: setting any action will also clear the `host_boot_just_failed`
// flag, to avoid keeping incorrect state around. Since we have no
// explicit way of tracking the host boot software's lifecycle, this is
// a compromise.
let mut action = None;

let response = match request {
HostToSp::_Unused => {
Some(SpToHost::DecodeFailure(DecodeFailureReason::Deserialize))
}
HostToSp::RequestReboot => {
action = Some(Action::RebootHost);
if core::mem::take(&mut self.host_boot_just_failed) {
Member: The use of mem::take here is very cute! But, IIUC, since we clear the flag later if action is Some, we don't actually need to clear it when we read the flag's value here, do we? Since both branches set action to Some regardless?

Up to you whether this actually ought to change, but at a first glance it kind of appeared that this was the only case where we clear the flag, so it was a bit less obvious that any branch where action is Some clears the flag (despite the comment). 🤷‍♀️

Collaborator (author): Yeah, I wound up doing both here to make it really really unlikely that we forget to clear the flag. Arguably unnecessary. What do you think would be clearest?

Member: I think maybe leaving it as is, but commenting that this also gets cleared because we've set action = Some(...)? Honestly, I think it's not really a big deal --- mostly, I wanted to make sure my understanding was correct.

Collaborator (author): The fact that it threw you suggests that I need to make it clearer, so I'll see about adding some comments.
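The pattern under discussion can be shown in isolation. This is a minimal sketch (not the host-sp-comms code itself) of what core::mem::take does with a bool flag: it returns the current value and resets the flag to its Default (false) in one expression.

```rust
// Demonstrates the read-and-clear semantics of core::mem::take on a bool.
fn take_flag(flag: &mut bool) -> bool {
    core::mem::take(flag)
}

fn main() {
    let mut host_boot_just_failed = true;
    // Reading the flag also clears it...
    assert!(take_flag(&mut host_boot_just_failed));
    assert!(!host_boot_just_failed);
    // ...so a second read observes the cleared state.
    assert!(!take_flag(&mut host_boot_just_failed));
}
```

This is why the branch condition itself already clears the flag, independently of the later blanket clear when `action` is `Some`.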

// Do _not_ reboot. Reinterpret this as a poweroff request.
action = Some(Action::PowerOffHost);
Contributor: I'm happy to look at this more tomorrow, but we might need to address the TODO I left in handle_jefe_notification. There may be a sequence of reboot operations where we still reboot even if we set this action.

Collaborator (author): Good to know, let's talk tomorrow.

Contributor: I think I have convinced myself this is okay (where "this" is really "the handling of self.reboot_state"):

  1. self.reboot_state starts out None
  2. It can only go from None to Some(_) inside a call to power_off_host(true)
  3. Once it goes to Some(_), it will eventually cause the host to come back on through one of these paths:
  • If power_off_host(true) succeeds in asking the sequencer to go to A2, it's set to Some(WaitingForA2). Once we get a notification from jefe that we've entered A2, it will change to Some(WaitingInA2RebootDelay) and start the Timers::WaitingInA2ToReboot timer. Once that timer fires, we'll enter handle_reboot_waiting_in_a2_timer, set the state to A0, and set self.reboot_state back to None.
  • If power_off_host(true) fails to ask the sequencer to go to A2 because we're already in A2, it's set to Some(WaitingInA2RebootDelay) and we start the Timers::WaitingInA2ToReboot timer. Once that timer fires, we'll enter handle_reboot_waiting_in_a2_timer, set the state to A0, and set self.reboot_state back to None.

power_off_host(true) is only called in two places: if we get a jefe notification that we've gone to A0Reset, or if the host sends us HostToSp::RequestReboot. If either of those happened and before we go through the above timer-based process (~5 seconds) to reboot the host we landed here with a host boot fail, the PowerOffHost would be effectively ignored because we'd already be in the process of rebooting the host. But that doesn't seem likely, and if it happens it's maybe fine to honor the "first request" that triggered the reboot?
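The timer-driven sequence walked through above can be condensed into a small state machine. This is a hypothetical model for illustration only: the names WaitingForA2 and WaitingInA2RebootDelay mirror the comment, but the real RebootState, jefe notification handling, and timer plumbing in host-sp-comms differ.

```rust
// Condensed sketch of the reboot_state transitions described above.
#[derive(Debug, PartialEq)]
enum RebootState {
    WaitingForA2,
    WaitingInA2RebootDelay,
}

struct Server {
    reboot_state: Option<RebootState>,
}

impl Server {
    // power_off_host(true) begins a reboot sequence; the starting state
    // depends on whether the sequencer is already in A2.
    fn power_off_host(&mut self, reboot: bool, already_in_a2: bool) {
        if reboot {
            self.reboot_state = Some(if already_in_a2 {
                RebootState::WaitingInA2RebootDelay
            } else {
                RebootState::WaitingForA2
            });
        }
    }

    // jefe notifies us that we've entered A2.
    fn on_entered_a2(&mut self) {
        if self.reboot_state == Some(RebootState::WaitingForA2) {
            self.reboot_state = Some(RebootState::WaitingInA2RebootDelay);
        }
    }

    // The WaitingInA2ToReboot timer fired: go back to A0 and reset.
    fn on_a2_reboot_timer(&mut self) {
        if self.reboot_state == Some(RebootState::WaitingInA2RebootDelay) {
            // (would set the power state to A0 here)
            self.reboot_state = None;
        }
    }
}

fn main() {
    let mut s = Server { reboot_state: None };
    s.power_off_host(true, false);
    s.on_entered_a2();
    s.on_a2_reboot_timer();
    assert_eq!(s.reboot_state, None); // host is on its way back to A0
}
```

Either path through the model ends with reboot_state back at None, matching the argument that a Some(_) state always eventually brings the host back on.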

Collaborator (author): Currently, if the host sends back-to-back "power off" and "reboot" requests, we'll behave ... rather strangely. I wonder if it might be worth adding code to stop listening to IPCC entirely starting at any reboot/powerdown request and continuing up until we believe the host to have been powered back on.

} else {
action = Some(Action::RebootHost);
}
None
}
HostToSp::RequestPowerOff => {
@@ -777,6 +797,11 @@ impl ServerImpl {
for b in &mut self.host_kv_storage.last_boot_fail[n..] {
*b = 0;
}

// Record that this failure happened, which will change the
// behavior of the reboot request we expect to follow shortly.
self.host_boot_just_failed = true;
Contributor: Do we care about the reason (which we're ..'ing out in the match arm), or should all boot failure reasons be treated equally? Maybe a question for @citrus-it

Collaborator (author): It would be my strong preference to have the same behavior regardless of reported reason, but I'm open to changing that if somebody wants to make an argument!

Contributor: We don't ever expect to see any boot failures apart from in the lab, where they're triggered pretty often in my experience. It just takes somebody doing something like configuring net boot and forgetting to start the boot server on the butler, or starting it with a typo in the options. We should probably send a heads-up about this new behaviour.

It is very unlikely that any of the boot failure reasons indicate that there would be a different result if retried, I think they should all be treated equally as requiring intervention.
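The "treat all reasons equally" choice shows up in the code as a `..` pattern that discards the payload. This is a hypothetical sketch (the real HostToSp message and reason types in host-sp-comms differ) of how ignoring the reason looks in a match arm:

```rust
// Hypothetical message types, for illustration only.
#[allow(dead_code)]
enum BootFailReason {
    NetBootTimeout,
    BadImage,
    Other,
}

#[allow(dead_code)]
enum HostToSp {
    BootFail { reason: BootFailReason },
}

// The `..` deliberately discards the reason: every boot failure is
// treated identically, as requiring operator intervention.
fn handle(msg: HostToSp, host_boot_just_failed: &mut bool) {
    match msg {
        HostToSp::BootFail { .. } => *host_boot_just_failed = true,
    }
}

fn main() {
    let mut flag = false;
    handle(
        HostToSp::BootFail { reason: BootFailReason::NetBootTimeout },
        &mut flag,
    );
    assert!(flag);
}
```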

Contributor: FWIW, I did just hit this exact case the other day personally, because I wasn't fast enough in the lab with net boot / was partially distracted. Is there something that makes it very clear from the sequencer state or related that we're in this case, as a human looking at the system?

Collaborator (author): The behavior as implemented in this commit won't affect the sequencer state -- the machine will just power itself down on boot failure and wait to be turned back on. We don't currently have a great way of exposing "the last problem with this was _____" data in production builds. As a user, what sort of experience would you hope for?


Some(SpToHost::Ack)
}
HostToSp::HostPanic { .. } => {
@@ -793,6 +818,10 @@ impl ServerImpl {
for b in &mut self.host_kv_storage.last_panic[n..] {
*b = 0;
}

// Neither set nor clear the host_boot_just_failed flag, since
// this message often immediately follows a boot failure.

Some(SpToHost::Ack)
}
HostToSp::GetStatus => {
@@ -888,6 +917,9 @@ impl ServerImpl {
// Now that all buffer borrowing is done, we can borrow `self` mutably
// again to perform any necessary action.
if let Some(action) = action {
// Defensively clear this flag to avoid carrying stale state.
self.host_boot_just_failed = false;

match action {
Action::RebootHost => self.power_off_host(true),
Action::PowerOffHost => self.power_off_host(false),