Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a watchdog timer to catch OTA process getting stuck #16542

Merged
merged 14 commits into from
Mar 25, 2022
101 changes: 87 additions & 14 deletions src/app/clusters/ota-requestor/GenericOTARequestorDriver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ void GenericOTARequestorDriver::Init(OTARequestorInterface * requestor, OTAImage
else
{
// Start the first periodic query timer
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
}
}

Expand Down Expand Up @@ -118,17 +118,23 @@ bool GenericOTARequestorDriver::ProviderLocationsEqual(const ProviderLocationTyp

void GenericOTARequestorDriver::HandleError(UpdateFailureState state, CHIP_ERROR error) {}

void GenericOTARequestorDriver::HandleIdleStateExit()
{
// Start watchdog timer to monitor new Query Image session
StartSelectedTimer(SelectedTimer::kWatchdogTimer);
}

void GenericOTARequestorDriver::HandleIdleState(IdleStateReason reason)
{
switch (reason)
{
case IdleStateReason::kUnknown:
ChipLogProgress(SoftwareUpdate, "Unknown idle state reason so set the periodic timer for a next attempt");
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
break;
case IdleStateReason::kIdle:
// There is no current OTA update in progress so start the periodic query timer
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
break;
case IdleStateReason::kInvalidSession:
// An invalid session is detected which may be temporary so try to query the same provider again
Expand Down Expand Up @@ -201,7 +207,7 @@ void GenericOTARequestorDriver::UpdateNotFound(UpdateNotFoundReason reason, Syst
else
{
ChipLogProgress(SoftwareUpdate, "UpdateNotFound, not scheduling further retries");
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
}
}

Expand Down Expand Up @@ -240,7 +246,7 @@ void GenericOTARequestorDriver::UpdateDiscontinued()
UpdateCancelled();

// Restart the periodic default provider timer
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
}

// Cancel all OTA update timers
Expand Down Expand Up @@ -337,14 +343,14 @@ void GenericOTARequestorDriver::SendQueryImage()

// Default provider timer only runs when there is no ongoing query/update; must stop it now.
// TriggerImmediateQueryInternal() will cause the state to change from kIdle
StopDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kWatchdogTimer);
isiu-apple marked this conversation as resolved.
Show resolved Hide resolved

mProviderRetryCount++;

DeviceLayer::SystemLayer().ScheduleLambda([this] { mRequestor->TriggerImmediateQueryInternal(); });
}

void GenericOTARequestorDriver::DefaultProviderTimerHandler(System::Layer * systemLayer, void * appState)
void GenericOTARequestorDriver::PeriodicQueryTimerHandler(System::Layer * systemLayer, void * appState)
{
ChipLogProgress(SoftwareUpdate, "Default Provider timer handler is invoked");

Expand All @@ -353,7 +359,7 @@ void GenericOTARequestorDriver::DefaultProviderTimerHandler(System::Layer * syst
bool listExhausted = false;
if (GetNextProviderLocation(providerLocation, listExhausted) != true)
{
StartDefaultProviderTimer();
StartSelectedTimer(SelectedTimer::kPeriodicQueryTimer);
return;
}

Expand All @@ -362,28 +368,95 @@ void GenericOTARequestorDriver::DefaultProviderTimerHandler(System::Layer * syst
SendQueryImage();
}

void GenericOTARequestorDriver::StartDefaultProviderTimer()
void GenericOTARequestorDriver::StartPeriodicQueryTimer()
{
ChipLogProgress(SoftwareUpdate, "Starting the Default Provider timer, timeout: %u seconds",
ChipLogProgress(SoftwareUpdate, "Starting the periodic query timer, timeout: %u seconds",
(unsigned int) mPeriodicQueryTimeInterval);
ScheduleDelayedAction(
System::Clock::Seconds32(mPeriodicQueryTimeInterval),
[](System::Layer *, void * context) {
(static_cast<GenericOTARequestorDriver *>(context))->DefaultProviderTimerHandler(nullptr, context);
(static_cast<GenericOTARequestorDriver *>(context))->PeriodicQueryTimerHandler(nullptr, context);
},
this);
}

void GenericOTARequestorDriver::StopDefaultProviderTimer()
void GenericOTARequestorDriver::StopPeriodicQueryTimer()
{
ChipLogProgress(SoftwareUpdate, "Stopping the Default Provider timer");
ChipLogProgress(SoftwareUpdate, "Stopping the Periodic Query timer");
CancelDelayedAction(
[](System::Layer *, void * context) {
(static_cast<GenericOTARequestorDriver *>(context))->DefaultProviderTimerHandler(nullptr, context);
(static_cast<GenericOTARequestorDriver *>(context))->PeriodicQueryTimerHandler(nullptr, context);
},
this);
}

void GenericOTARequestorDriver::WatchdogTimerHandler(System::Layer * systemLayer, void * appState)
{
ChipLogProgress(SoftwareUpdate, "Watchdog timer handler is invoked");

OTAUpdateStateEnum currentState = mRequestor->GetCurrentUpdateState();

switch (currentState)
{
case OTAUpdateStateEnum::kIdle:
case OTAUpdateStateEnum::kUnknown:
// Watchdog timer should not expire in Idle state and periodic timer should already be scheduled.
// Scheduling periodic timer here just in case.
StartPeriodicQueryTimer();
break;
case OTAUpdateStateEnum::kDownloading:
case OTAUpdateStateEnum::kApplying:
case OTAUpdateStateEnum::kDelayedOnApply:
UpdateDiscontinued();
mRequestor->CancelImageUpdate();
StartPeriodicQueryTimer();
break;
case OTAUpdateStateEnum::kQuerying:
case OTAUpdateStateEnum::kDelayedOnQuery:
case OTAUpdateStateEnum::kRollingBack:
case OTAUpdateStateEnum::kDelayedOnUserConsent:
isiu-apple marked this conversation as resolved.
Show resolved Hide resolved
mRequestor->Reset();
StartPeriodicQueryTimer();
break;
}
}

void GenericOTARequestorDriver::StartWatchdogTimer()
{
ChipLogProgress(SoftwareUpdate, "Starting the watchdog timer, timeout: %u seconds", (unsigned int) mWatchdogTimeInterval);
ScheduleDelayedAction(
System::Clock::Seconds32(mWatchdogTimeInterval),
[](System::Layer *, void * context) {
(static_cast<GenericOTARequestorDriver *>(context))->WatchdogTimerHandler(nullptr, context);
},
this);
}

void GenericOTARequestorDriver::StopWatchdogTimer()
{
ChipLogProgress(SoftwareUpdate, "Stopping the watchdog timer");
CancelDelayedAction(
[](System::Layer *, void * context) {
(static_cast<GenericOTARequestorDriver *>(context))->WatchdogTimerHandler(nullptr, context);
},
this);
}

void GenericOTARequestorDriver::StartSelectedTimer(SelectedTimer timer)
{
switch (timer)
{
case SelectedTimer::kPeriodicQueryTimer:
StopWatchdogTimer();
StartPeriodicQueryTimer();
break;
case SelectedTimer::kWatchdogTimer:
StopPeriodicQueryTimer();
StartWatchdogTimer();
break;
}
}

/**
* Returns the next available Provider location. The algorithm is to simply loop through the list of DefaultOtaProviders as a
* circular list and return the next value (based on the last used provider). If the list of DefaultOtaProviders is empty, FALSE is
Expand Down
12 changes: 9 additions & 3 deletions src/app/clusters/ota-requestor/GenericOTARequestorDriver.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class GenericOTARequestorDriver : public OTARequestorDriver
bool CanConsent() override;
uint16_t GetMaxDownloadBlockSize() override;
void HandleError(UpdateFailureState state, CHIP_ERROR error) override;
void HandleIdleStateExit() override;
void HandleIdleState(IdleStateReason reason) override;
void UpdateAvailable(const UpdateDescription & update, System::Clock::Seconds32 delay) override;
void UpdateNotFound(UpdateNotFoundReason reason, System::Clock::Seconds32 delay) override;
Expand All @@ -71,9 +72,13 @@ class GenericOTARequestorDriver : public OTARequestorDriver
bool GetNextProviderLocation(ProviderLocationType & providerLocation, bool & listExhausted) override;

protected:
void StartDefaultProviderTimer();
void StopDefaultProviderTimer();
void DefaultProviderTimerHandler(System::Layer * systemLayer, void * appState);
void StartPeriodicQueryTimer();
isiu-apple marked this conversation as resolved.
Show resolved Hide resolved
void StopPeriodicQueryTimer();
void PeriodicQueryTimerHandler(System::Layer * systemLayer, void * appState);
void StartWatchdogTimer();
void StopWatchdogTimer();
void WatchdogTimerHandler(System::Layer * systemLayer, void * appState);
void StartSelectedTimer(SelectedTimer timer);
void ScheduleDelayedAction(System::Clock::Seconds32 delay, System::TimerCompleteCallback action, void * aAppState);
void CancelDelayedAction(System::TimerCompleteCallback action, void * aAppState);
bool ProviderLocationsEqual(const ProviderLocationType & a, const ProviderLocationType & b);
Expand All @@ -82,6 +87,7 @@ class GenericOTARequestorDriver : public OTARequestorDriver
OTAImageProcessorInterface * mImageProcessor = nullptr;
uint32_t mOtaStartDelaySec = 0;
uint32_t mPeriodicQueryTimeInterval = (24 * 60 * 60); // Timeout for querying providers on the default OTA provider list
uint32_t mWatchdogTimeInterval = (6 * 60 * 60); // Timeout (in seconds) for checking if Requestor has reverted back to idle mode
// Maximum number of times to retry a BUSY OTA provider before moving to the next available one
static constexpr uint8_t kMaxBusyProviderRetryCount = 3;
uint8_t mProviderRetryCount; // Track retry count for the current provider
Expand Down
7 changes: 6 additions & 1 deletion src/app/clusters/ota-requestor/OTARequestor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@ void OTARequestor::CancelImageUpdate()

mOtaRequestorDriver->UpdateCancelled();

RecordNewUpdateState(OTAUpdateStateEnum::kIdle, OTAChangeReasonEnum::kUnknown);
Reset();
}

CHIP_ERROR OTARequestor::GetUpdateStateProgressAttribute(EndpointId endpointId, app::DataModel::Nullable<uint8_t> & progress)
Expand Down Expand Up @@ -658,13 +658,18 @@ void OTARequestor::RecordNewUpdateState(OTAUpdateStateEnum newState, OTAChangeRe
}
OtaRequestorServerOnStateTransition(mCurrentUpdateState, newState, reason, targetSoftwareVersion);

// Issue#16151 tracks re-factoring error and state transitioning handling.
if ((newState == OTAUpdateStateEnum::kIdle) && (mCurrentUpdateState != OTAUpdateStateEnum::kIdle))
{
IdleStateReason idleStateReason = MapErrorToIdleStateReason(error);

// Inform the driver that the OTARequestor has entered the Idle state
mOtaRequestorDriver->HandleIdleState(idleStateReason);
}
else if ((mCurrentUpdateState == OTAUpdateStateEnum::kIdle) && (newState != OTAUpdateStateEnum::kIdle))
{
mOtaRequestorDriver->HandleIdleStateExit();
}

mCurrentUpdateState = newState;
}
Expand Down
10 changes: 10 additions & 0 deletions src/app/clusters/ota-requestor/OTARequestorDriver.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ enum class IdleStateReason
kInvalidSession,
};

// The current selected OTA Requestor timer to be running
enum class SelectedTimer
{
kPeriodicQueryTimer,
kWatchdogTimer,
};

// Interface class to abstract the OTA-related business logic. Each application
// must implement this interface. All calls must be non-blocking unless stated otherwise
class OTARequestorDriver
Expand All @@ -87,6 +94,9 @@ class OTARequestorDriver
/// Called when an error occurs at any OTA requestor operation
virtual void HandleError(UpdateFailureState state, CHIP_ERROR error) = 0;

/// Called when OTA Requestor has exited the Idle state for which the driver may need to take various actions
virtual void HandleIdleStateExit() = 0;

// Called when the OTA Requestor has entered the Idle state for which the driver may need to take various actions
virtual void HandleIdleState(IdleStateReason reason) = 0;

Expand Down