diff --git a/Cargo.lock b/Cargo.lock index 9f7d36a32..893a5d72c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4881,8 +4881,8 @@ dependencies = [ [[package]] name = "transceiver-messages" -version = "0.1.0" -source = "git+https://github.com/oxidecomputer/transceiver-control/#142ebb7ac603ea8541c42f53c4a5a34f1176114a" +version = "0.1.1" +source = "git+https://github.com/oxidecomputer/transceiver-control/#84e28d1263d9d07c5410fb0644469c8eb7b5fb5f" dependencies = [ "bitflags 2.1.0", "hubpack", diff --git a/drv/transceivers-server/src/main.rs b/drv/transceivers-server/src/main.rs index 55067b38a..b6fb6df85 100644 --- a/drv/transceivers-server/src/main.rs +++ b/drv/transceivers-server/src/main.rs @@ -57,6 +57,7 @@ enum Trace { GotInterface(u8, ManagementInterface), UnknownInterface(u8, ManagementInterface), UnpluggedModule(usize), + RemovedDisabledModule(usize), TemperatureReadError(usize, Reg::QSFP::PORT0_STATUS::Encoded), TemperatureReadUnexpectedError(usize, FpgaError), SensorError(usize, SensorError), @@ -64,11 +65,26 @@ enum Trace { GetInterfaceError(usize, Reg::QSFP::PORT0_STATUS::Encoded), GetInterfaceUnexpectedError(usize, FpgaError), InvalidPortStatusError(usize, u8), + DisablingPorts(LogicalPortMask), + DisableFailed(usize, LogicalPortMask), } ringbuf!(Trace, 16, Trace::None); //////////////////////////////////////////////////////////////////////////////// +/// After seeing this many NACKs, we disable the port by policy. +/// +/// This should be **very rare**: it requires a transceiver to correctly report +/// its type (SFF vs CMIS) over I2C when it's first plugged in, but then begin +/// NACKing while still physically present (according to the `modprsl` pin). +/// +/// Despite the weirdness of these pre-requisites, we've seen this happen once +/// already; without handling it, the thermal loop will eventually shut down the +/// whole system (because the transceiver stops reporting its temperature). +const MAX_CONSECUTIVE_NACKS: u8 = 3; + +//////////////////////////////////////////////////////////////////////////////// + #[derive(Copy, Clone)] struct LedStates([LedState; NUM_PORTS as usize]); @@ -85,6 +101,12 @@ struct ServerImpl { blink_on: bool, system_led_state: LedState, + /// Modules that are physically present but disabled by Hubris + disabled: LogicalPortMask, + + /// Number of consecutive NACKS seen on a given port + consecutive_nacks: [u8; NUM_PORTS as usize], + /// Handle to write thermal models and presence to the `thermal` task thermal_api: Thermal, @@ -308,7 +330,8 @@ impl ServerImpl { let mask = 1 << i; let operational = (!status.modprsl & mask) != 0 && (status.power_good & mask) != 0 - && (status.resetl & mask) != 0; + && (status.resetl & mask) != 0 + && (self.disabled & port).is_empty(); // A wild transceiver just appeared! Read it to decide whether it's // using SFF-8636 or CMIS. @@ -352,11 +375,18 @@ impl ServerImpl { ringbuf_entry!(Trace::SensorError(i, e)); } - ringbuf_entry!(Trace::UnpluggedModule(i)); + if (self.disabled & port).is_empty() { + ringbuf_entry!(Trace::UnpluggedModule(i)); + } else { + ringbuf_entry!(Trace::RemovedDisabledModule(i)); + } self.thermal_models[i] = None; } } + // Accumulate ports to disable (but don't disable them in the loop), to + // avoid issues with the borrow checker. + let mut to_disable = LogicalPortMask(0); for (i, m) in self.thermal_models.iter().enumerate() { let port = LogicalPort(i as u8); let m = match m { @@ -386,6 +416,7 @@ impl ServerImpl { continue; } }; + let mut got_nack = false; match temperature { Ok(t) => { // We got a temperature! Send it over to the thermal task @@ -403,8 +434,10 @@ impl ServerImpl { // be transient (and we'll remove the transceiver on the // next pass through this function). Err(FpgaError::ImplError(e)) => { - match Reg::QSFP::PORT0_STATUS::Encoded::from_u8(e) { + use Reg::QSFP::PORT0_STATUS::Encoded; + match Encoded::from_u8(e) { Some(val) => { + got_nack |= matches!(val, Encoded::I2cAddressNack); ringbuf_entry!(Trace::TemperatureReadError(i, val)) } None => { @@ -417,7 +450,41 @@ impl ServerImpl { ringbuf_entry!(Trace::TemperatureReadUnexpectedError(i, e)); } } + + self.consecutive_nacks[i] = if got_nack { + self.consecutive_nacks[i].saturating_add(1) + } else { + 0 + }; + + if self.consecutive_nacks[i] >= MAX_CONSECUTIVE_NACKS { + to_disable.set(port); + } + } + if !to_disable.is_empty() { + self.disable_ports(to_disable); + } + } + + fn disable_ports(&mut self, mask: LogicalPortMask) { + ringbuf_entry!(Trace::DisablingPorts(mask)); + for (step, f) in [ + Transceivers::assert_reset, + Transceivers::deassert_lpmode, + Transceivers::disable_power, + ] + .iter() + .enumerate() + { + let err = f(&mut self.transceivers, mask).error(); + if !err.is_empty() { + ringbuf_entry!(Trace::DisableFailed(step, err)); + } } + self.disabled |= mask; + // We don't modify self.thermal_models here; that's left to + // `update_thermal_loop`, which is in charge of communicating with + // the `sensors` and `thermal` tasks. } } @@ -555,6 +622,8 @@ fn main() -> ! { led_states: LedStates([LedState::Off; NUM_PORTS as usize]), blink_on: false, system_led_state: LedState::Off, + disabled: LogicalPortMask(0), + consecutive_nacks: [0; NUM_PORTS as usize], thermal_api, sensor_api, thermal_models: [None; NUM_PORTS as usize], diff --git a/drv/transceivers-server/src/udp.rs b/drv/transceivers-server/src/udp.rs index 7f79524d2..e8f18f07e 100644 --- a/drv/transceivers-server/src/udp.rs +++ b/drv/transceivers-server/src/udp.rs @@ -42,6 +42,7 @@ enum Trace { EnablePower(ModuleId), DisablePower(ModuleId), Status(ModuleId), + ExtendedStatus(ModuleId), Read(ModuleId, MemoryRead), Write(ModuleId, MemoryWrite), ManagementInterface(ManagementInterface), @@ -57,6 +58,7 @@ enum Trace { ClearPowerFault(ModuleId), LedState(ModuleId), SetLedState(ModuleId, LedState), + ClearDisableLatch(ModuleId), } ringbuf!(Trace, 16, Trace::None); @@ -321,6 +323,28 @@ impl ServerImpl { final_payload_len, ) } + HostRequest::ExtendedStatus(modules) => { + ringbuf_entry!(Trace::ExtendedStatus(modules)); + let mask = LogicalPortMask::from(modules); + let (num_status_bytes, result) = + self.get_extended_status(mask, out); + ringbuf_entry!(Trace::OperationNoFailResult(result)); + let success = ModuleId::from(result.success()); + let (err_len, errored_modules) = self.handle_errors( + modules, + result, + &mut out[num_status_bytes..], + ); + let final_payload_len = num_status_bytes + err_len; + + ( + MessageBody::SpResponse(SpResponse::ExtendedStatus { + modules: success, + failed_modules: errored_modules, + }), + final_payload_len, + ) + } HostRequest::Read { modules, read } => { ringbuf_entry!(Trace::Read(modules, read)); // The host is not setting the the upper 32 bits at this time, @@ -329,9 +353,11 @@ impl ServerImpl { let num_invalid = ModuleId(modules.0 & 0xffffffff00000000) .selected_transceiver_count(); let mask = LogicalPortMask::from(modules); + let num_disabled = (mask & self.disabled).count(); let read_data = read.len() as usize * mask.count(); - let invalid_module_err = HwError::MAX_SIZE * num_invalid; - if read_data + invalid_module_err + let module_err = + HwError::MAX_SIZE * (num_invalid + num_disabled); + if read_data + module_err > transceiver_messages::MAX_PAYLOAD_SIZE { return ( @@ -340,12 +366,12 @@ impl ServerImpl { ); } - let result = self.read(read, mask, out); + let result = self.read(read, mask & !self.disabled, out); ringbuf_entry!(Trace::OperationResult(result)); let success = ModuleId::from(result.success()); let read_bytes = result.success().count() * read.len() as usize; let (err_len, failed_modules) = self - .handle_errors_and_failures( + .handle_errors_and_failures_and_disabled( modules, result, HwError::I2cError, @@ -374,11 +400,11 @@ impl ServerImpl { ); } let mask = LogicalPortMask::from(modules); - let result = self.write(write, mask, data); + let result = self.write(write, mask & !self.disabled, data); ringbuf_entry!(Trace::OperationResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = self - .handle_errors_and_failures( + .handle_errors_and_failures_and_disabled( modules, result, HwError::I2cError, @@ -396,12 +422,12 @@ impl ServerImpl { } HostRequest::AssertReset(modules) => { ringbuf_entry!(Trace::AssertReset(modules)); - let mask = LogicalPortMask::from(modules); + let mask = LogicalPortMask::from(modules) & !self.disabled; let result = self.transceivers.assert_reset(mask); ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = - self.handle_errors(modules, result, out); + self.handle_errors_and_disabled(modules, result, out); ( MessageBody::SpResponse(SpResponse::Ack { @@ -413,12 +439,12 @@ impl ServerImpl { } HostRequest::DeassertReset(modules) => { ringbuf_entry!(Trace::DeassertReset(modules)); - let mask = LogicalPortMask::from(modules); + let mask = LogicalPortMask::from(modules) & !self.disabled; let result = self.transceivers.deassert_reset(mask); ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = - self.handle_errors(modules, result, out); + self.handle_errors_and_disabled(modules, result, out); ( MessageBody::SpResponse(SpResponse::Ack { @@ -430,12 +456,12 @@ impl ServerImpl { } HostRequest::AssertLpMode(modules) => { ringbuf_entry!(Trace::AssertLpMode(modules)); - let mask = LogicalPortMask::from(modules); + let mask = LogicalPortMask::from(modules) & !self.disabled; let result = self.transceivers.assert_lpmode(mask); ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = - self.handle_errors(modules, result, out); + self.handle_errors_and_disabled(modules, result, out); ( MessageBody::SpResponse(SpResponse::Ack { @@ -447,12 +473,12 @@ impl ServerImpl { } HostRequest::DeassertLpMode(modules) => { ringbuf_entry!(Trace::DeassertLpMode(modules)); - let mask = LogicalPortMask::from(modules); + let mask = LogicalPortMask::from(modules) & !self.disabled; let result = self.transceivers.deassert_lpmode(mask); ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = - self.handle_errors(modules, result, out); + self.handle_errors_and_disabled(modules, result, out); ( MessageBody::SpResponse(SpResponse::Ack { @@ -464,12 +490,12 @@ impl ServerImpl { } HostRequest::EnablePower(modules) => { ringbuf_entry!(Trace::EnablePower(modules)); - let mask = LogicalPortMask::from(modules); + let mask = LogicalPortMask::from(modules) & !self.disabled; let result = self.transceivers.enable_power(mask); ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = - self.handle_errors(modules, result, out); + self.handle_errors_and_disabled(modules, result, out); ( MessageBody::SpResponse(SpResponse::Ack { @@ -481,12 +507,12 @@ impl ServerImpl { } HostRequest::DisablePower(modules) => { ringbuf_entry!(Trace::DisablePower(modules)); - let mask = LogicalPortMask::from(modules); + let mask = LogicalPortMask::from(modules) & !self.disabled; let result = self.transceivers.disable_power(mask); ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = - self.handle_errors(modules, result, out); + self.handle_errors_and_disabled(modules, result, out); ( MessageBody::SpResponse(SpResponse::Ack { @@ -524,12 +550,12 @@ impl ServerImpl { } HostRequest::ClearPowerFault(modules) => { ringbuf_entry!(Trace::ClearPowerFault(modules)); - let mask = LogicalPortMask::from(modules); + let mask = LogicalPortMask::from(modules) & !self.disabled; let result = self.transceivers.clear_power_fault(mask); ringbuf_entry!(Trace::OperationNoFailResult(result)); let success = ModuleId::from(result.success()); let (num_err_bytes, failed_modules) = - self.handle_errors(modules, result, out); + self.handle_errors_and_disabled(modules, result, out); ( MessageBody::SpResponse(SpResponse::Ack { @@ -582,14 +608,37 @@ impl ServerImpl { num_err_bytes, ) } + HostRequest::ClearDisableLatch(modules) => { + ringbuf_entry!(Trace::ClearDisableLatch(modules)); + let mask = LogicalPortMask::from(modules); + self.disabled &= !mask; + // This operation just sets internal SP state, so it is always + // successful. However, invalid modules may been specified by + // the host so we need to check that anyway. + let result = + ModuleResultNoFailure::new(mask, LogicalPortMask(0)) + .unwrap(); + let success = ModuleId::from(result.success()); + let (num_err_bytes, failed_modules) = + self.handle_errors(modules, result, out); + ( + MessageBody::SpResponse(SpResponse::Ack { + modules: success, + failed_modules, + }), + num_err_bytes, + ) + } } } /// This function reads a `ModuleResult` and populates and failure or error - /// information at the end of the trailing data buffer. This means it should - /// be called as the last operation before sending the response. For results - /// where a `ModuleResultNoFailure` is returned, use handle_errors instead. - fn handle_errors_and_failures( + /// information at the end of the trailing data buffer, taking + /// `self.disabled` into account. This means it should be called as the last + /// operation before sending the response. For results where a + /// `ModuleResultNoFailure` is returned, use `handle_errors` or + /// `handle_errors_and_disabled` instead. + fn handle_errors_and_failures_and_disabled( &mut self, modules: ModuleId, result: ModuleResult, @@ -599,10 +648,22 @@ impl ServerImpl { let mut error_idx: usize = 0; // any modules at index 32->63 are not currently supported. let invalid_modules = ModuleId(0xffffffff00000000); - let requested_invalid_modules = ModuleId(modules.0 & invalid_modules.0); + let requested_invalid_modules = modules & invalid_modules; + + let disabled: ModuleId = self.disabled.into(); + let requested_disabled_modules = modules & disabled; + for module in modules.to_indices().map(LogicalPort) { if module <= LogicalPortMask::MAX_PORT_INDEX { - if result.failure().is_set(module) { + if requested_disabled_modules.is_set(module.0).unwrap() { + // let the host know it requested disabled modules + let err_size = hubpack::serialize( + &mut out[error_idx..], + &HwError::DisabledBySp, + ) + .unwrap(); + error_idx += err_size; + } else if result.failure().is_set(module) { // failure: whatever `HwError` specified by `failure_type` let err_size = hubpack::serialize( &mut out[error_idx..], @@ -636,6 +697,7 @@ impl ServerImpl { error_idx, ModuleId( requested_invalid_modules.0 + | requested_disabled_modules.0 | result.failure().0 as u64 | result.error().0 as u64, ), @@ -643,10 +705,10 @@ impl ServerImpl { } /// This function reads a `ModuleResultNoFailure` and populates error - /// information at the end of the trailing data buffer. This means it should - /// be called as the last operation before sending the response. For results - /// where a `ModuleResult` is returned, use handle_errors_and_failures - /// instead. + /// information at the end of the trailing data buffer, ignoring + /// `self.disabled`. This means it should be called as the last operation + /// before sending the response. For results where a `ModuleResult` is + /// returned, use `handle_errors_and_failures_and_disabled` instead. fn handle_errors( &mut self, modules: ModuleId, @@ -656,7 +718,7 @@ impl ServerImpl { let mut error_idx: usize = 0; // any modules at index 32->63 are not currently supported. let invalid_modules = ModuleId(0xffffffff00000000); - let requested_invalid_modules = ModuleId(modules.0 & invalid_modules.0); + let requested_invalid_modules = modules & invalid_modules; for module in modules.to_indices().map(LogicalPort) { if module <= LogicalPortMask::MAX_PORT_INDEX && result.error().is_set(module) @@ -687,6 +749,68 @@ impl ServerImpl { ) } + /// This function reads a `ModuleResultNoFailure` and populates error + /// information at the end of the trailing data buffer, taking `self.data` + /// into account. This means it should be called as the last operation + /// before sending the response. For results where a `ModuleResult` is + /// returned, use `handle_errors_and_failures_and_disabled` instead. + fn handle_errors_and_disabled( + &mut self, + modules: ModuleId, + result: ModuleResultNoFailure, + out: &mut [u8], + ) -> (usize, ModuleId) { + let mut error_idx: usize = 0; + // any modules at index 32->63 are not currently supported. + let invalid_modules = ModuleId(0xffffffff00000000); + let requested_invalid_modules = modules & invalid_modules; + + // any modules that are listed in self.disabled are also not supported + let disabled: ModuleId = self.disabled.into(); + let requested_disabled_modules = modules & disabled; + + for module in modules.to_indices().map(LogicalPort) { + if module <= LogicalPortMask::MAX_PORT_INDEX { + if requested_disabled_modules.is_set(module.0).unwrap() { + // let the host know it requested unsupported modules + let err_size = hubpack::serialize( + &mut out[error_idx..], + &HwError::DisabledBySp, + ) + .unwrap(); + error_idx += err_size; + } else if result.error().is_set(module) { + // error: fpga communication issue + let err_size = hubpack::serialize( + &mut out[error_idx..], + &HwError::FpgaError, + ) + .unwrap(); + error_idx += err_size; + } + } else if requested_invalid_modules.is_set(module.0).unwrap() { + // let the host know it requested unsupported modules + let err_size = hubpack::serialize( + &mut out[error_idx..], + &HwError::InvalidModuleIndex, + ) + .unwrap(); + error_idx += err_size; + } + } + + // let the caller know how many error bytes we appended and which + // modules had problems + ( + error_idx, + ModuleId( + requested_invalid_modules.0 + | requested_disabled_modules.0 + | result.error().0 as u64, + ), + ) + } + fn get_status( &mut self, modules: LogicalPortMask, @@ -742,6 +866,63 @@ impl ServerImpl { (count, desired_result) } + fn get_extended_status( + &mut self, + modules: LogicalPortMask, + out: &mut [u8], + ) -> (usize, ModuleResultNoFailure) { + // This will get the status of every module, so we will have to only + // select the data which was requested. + let (mod_status, full_result) = self.transceivers.get_module_status(); + // adjust the result success mask to be only our requested modules + let desired_result = ModuleResultNoFailure::new( + full_result.success() & modules, + full_result.error() & modules, + ) + .unwrap(); + + // Write one u32 bitfield per active port in the ModuleId which was + // successfully retrieved above. + let mut count = 0; + for mask in modules + .to_indices() + .filter(|&p| desired_result.success().is_set(p)) + .map(|p| p.as_mask()) + { + let mut status = ExtendedStatus::empty(); + if (mod_status.power_enable & mask.0) != 0 { + status |= ExtendedStatus::ENABLED; + } + if (!mod_status.resetl & mask.0) != 0 { + status |= ExtendedStatus::RESET; + } + if (mod_status.lpmode_txdis & mask.0) != 0 { + status |= ExtendedStatus::LOW_POWER_MODE; + } + if (!mod_status.modprsl & mask.0) != 0 { + status |= ExtendedStatus::PRESENT; + } + if (!mod_status.intl_rxlosl & mask.0) != 0 { + status |= ExtendedStatus::INTERRUPT; + } + if (mod_status.power_good & mask.0) != 0 { + status |= ExtendedStatus::POWER_GOOD; + } + if (mod_status.power_good_timeout & mask.0) != 0 { + status |= ExtendedStatus::FAULT_POWER_TIMEOUT; + } + if (mod_status.power_good_fault & mask.0) != 0 { + status |= ExtendedStatus::FAULT_POWER_LOST; + } + if !(self.disabled & mask).is_empty() { + status |= ExtendedStatus::DISABLED_BY_SP; + } + count += + hubpack::serialize(&mut out[count..], &status.bits()).unwrap(); + } + (count, desired_result) + } + fn select_page( &mut self, page: Page, diff --git a/task/thermal/src/control.rs b/task/thermal/src/control.rs index 0a4d0da79..eed2c8094 100644 --- a/task/thermal/src/control.rs +++ b/task/thermal/src/control.rs @@ -226,11 +226,6 @@ pub struct ThermalSensorErrors { } impl ThermalSensorErrors { - #[allow(dead_code)] - pub fn is_empty(&self) -> bool { - self.next == 0 - } - pub fn clear(&mut self) { self.values = Default::default(); self.next = 0; @@ -1056,6 +1051,7 @@ impl<'a> ThermalControl<'a> { // ensuring that we'll wait for that channel to provide us with at least // one valid reading before resuming the PID loop. if self.dynamic_inputs[index].is_none() { + ringbuf_entry!(Trace::AddedDynamicInput(index)); self.dynamic_inputs[index] = Some(DynamicInputChannel { model }); self.reset_state(); } @@ -1069,6 +1065,7 @@ impl<'a> ThermalControl<'a> { if index >= bsp::NUM_DYNAMIC_TEMPERATURE_INPUTS { Err(ThermalError::InvalidIndex) } else { + ringbuf_entry!(Trace::RemovedDynamicInput(index)); self.dynamic_inputs[index] = None; // Post this reading to the sensors task as well diff --git a/task/thermal/src/main.rs b/task/thermal/src/main.rs index 220c4048b..960b61356 100644 --- a/task/thermal/src/main.rs +++ b/task/thermal/src/main.rs @@ -76,6 +76,8 @@ enum Trace { FanAdded(Fan), FanRemoved(Fan), PowerDownAt(u64), + AddedDynamicInput(usize), + RemovedDynamicInput(usize), } ringbuf!(Trace, 32, Trace::None);