From 495a400e8e20e4c487385b0591eb03782284661f Mon Sep 17 00:00:00 2001 From: "Cliff L. Biffle" Date: Wed, 14 Feb 2024 13:48:55 -0800 Subject: [PATCH] host-sp-comms: do not reboot host on boot failure This is in response to #1613. If the host reports a boot failure (such as a phase mismatch, but not limited to that reason) simply rebooting it blindly is unlikely to fix the problem. We need intervention from a higher power (the control plane) to fix the issue. So to avoid a bootloop that wastes energy and overwrites our circular buffers with spam, this change alters the response to the IPCC Request Reboot message if received shortly after a Boot Failed message -- it is interpreted as a power off request. Fixes #1614. --- task/host-sp-comms/src/main.rs | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/task/host-sp-comms/src/main.rs b/task/host-sp-comms/src/main.rs index 75a64e363..0a9b000a7 100644 --- a/task/host-sp-comms/src/main.rs +++ b/task/host-sp-comms/src/main.rs @@ -258,6 +258,13 @@ struct ServerImpl { reboot_state: Option, host_kv_storage: HostKeyValueStorage, hf_mux_state: Option, + + /// Set when we receive a Boot Failed message from the host. This flag + /// changes the behavior in response to the next Request Reboot message. It + /// is cleared at Request Reboot, *or* at any of the messages sent during + /// host boot, to try and keep our state from getting out of sync for + /// whatever reason. + host_boot_just_failed: bool, } impl ServerImpl { @@ -290,6 +297,8 @@ impl ServerImpl { reboot_state: None, host_kv_storage: HostKeyValueStorage::claim_static_resources(), hf_mux_state: None, + + host_boot_just_failed: false, } } @@ -701,13 +710,24 @@ impl ServerImpl { // We defer any actions until after we've serialized our response to // avoid borrow checker issues with calling methods on `self`. + // + // NOTE: setting any action will also clear the `host_boot_just_failed` + // flag, to avoid keeping incorrect state around. Since we have no + // explicit way of tracking the host boot software's lifecycle, this is + // a compromise. let mut action = None; + let response = match request { HostToSp::_Unused => { Some(SpToHost::DecodeFailure(DecodeFailureReason::Deserialize)) } HostToSp::RequestReboot => { - action = Some(Action::RebootHost); + if core::mem::take(&mut self.host_boot_just_failed) { + // Do _not_ reboot. Reinterpret this as a poweroff request. + action = Some(Action::PowerOffHost); + } else { + action = Some(Action::RebootHost); + } None } HostToSp::RequestPowerOff => { @@ -777,6 +797,11 @@ impl ServerImpl { for b in &mut self.host_kv_storage.last_boot_fail[n..] { *b = 0; } + + // Record that this failure happened, which will change the + // behavior of the reboot request we expect to follow shortly. + self.host_boot_just_failed = true; + Some(SpToHost::Ack) } HostToSp::HostPanic { .. } => { @@ -793,6 +818,10 @@ impl ServerImpl { for b in &mut self.host_kv_storage.last_panic[n..] { *b = 0; } + + // Neither set nor clear the host_boot_just_failed flag, for + // this message often immediately follows a boot failure. + Some(SpToHost::Ack) } HostToSp::GetStatus => { @@ -888,6 +917,9 @@ impl ServerImpl { // Now that all buffer borrowing is done, we can borrow `self` mutably // again to perform any necessary action. if let Some(action) = action { + // Protectively clear this flag to avoid keeping state. + self.host_boot_just_failed = false; + match action { Action::RebootHost => self.power_off_host(true), Action::PowerOffHost => self.power_off_host(false),