Skip to content

Commit

Permalink
Fix thread races in chip-tool
Browse files Browse the repository at this point in the history
Problem:

This PR achieves the following to fix-up the various thread-races detected by tsan in chip-tool:

Change:

- Following the pattern of 'external synchronization', sprinkled LockChipStack() and UnlockChipStack() calls around key call sites that called into the stack from the various command logic in chip-tool
- Removed usleep and global instance hacks.
- Reverts changes in project-chip#7494
- Re-structured Command::Run to now have the bulk of the stack initialization and shutdown be managed before Run() is called in Commands::Run(), and an ExecutionContext object pointer be stashed inside the Command for convenient access. This reduces the changes of people writing new commands of getting stack initialization wrong.
- Instead of sometimes using chip::Controller::DeviceController and sometimes DeviceCommissioner, just used the latter in all commands since that is the super-set class anyways.
- Added a new 'StopEventLoopTask' that is thread-safe, that is needed to be called by application logic before DeviceController::Shutdown() can be called with external synchronization.
- Pivots PlatformMgr::Shutdown() to not handle stopping the event queue,
  but only focus on cleaning up the stack objects.
- Fixed up TestMdns as well along the way.

Testing:

- Enabled tsan using 'is_tsan' build arg and used that catch well over
  10+ races, with not a single false-positive.
- Ran through all the chip-tool command groups (pairing, IM, discover,
  testcluster, payload, etc) 10x each to ensure no regressions in
  functionality as well as ensuring clean shutdown with tsan.
  • Loading branch information
mrjerryjohns committed Jun 11, 2021
1 parent 6f297c5 commit 7ee2771
Show file tree
Hide file tree
Showing 33 changed files with 463 additions and 198 deletions.
38 changes: 15 additions & 23 deletions examples/chip-tool/commands/clusters/ModelCommand.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,39 +37,31 @@ void DispatchSingleClusterCommand(chip::ClusterId aClusterId, chip::CommandId aC
"Default DispatchSingleClusterCommand is called, this should be replaced by actual dispatched for cluster commands");
}

CHIP_ERROR ModelCommand::Run(PersistentStorage & storage, NodeId localId, NodeId remoteId)
CHIP_ERROR ModelCommand::Run(NodeId localId, NodeId remoteId)
{
CHIP_ERROR err = CHIP_NO_ERROR;

chip::Controller::CommissionerInitParams initParams;
initParams.storageDelegate = &storage;

err = mOpCredsIssuer.Initialize(storage);
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(Controller, "Init failure! Operational Cred Issuer: %s", ErrorStr(err)));

initParams.operationalCredentialsDelegate = &mOpCredsIssuer;

err = mCommissioner.SetUdpListenPort(storage.GetListenPort());
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(Controller, "Init failure! Commissioner: %s", ErrorStr(err)));

err = mCommissioner.Init(localId, initParams);
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(Controller, "Init failure! Commissioner: %s", ErrorStr(err)));
//
// Set this to true first BEFORE we send commands to ensure we don't
// end up in a situation where the response comes back faster than we can
// set the variable to true, which will cause it to block indefinitely.
//
UpdateWaitForResponse(true);

{
chip::DeviceLayer::StackLock lock;

err = mCommissioner.ServiceEvents();
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(Controller, "Init failure! Run Loop: %s", ErrorStr(err)));
err = GetExecContext()->commissioner->GetDevice(remoteId, &mDevice);
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(chipTool, "Init failure! No pairing for device: %" PRIu64, localId));

err = mCommissioner.GetDevice(remoteId, &mDevice);
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(chipTool, "Init failure! No pairing for device: %" PRIu64, localId));
err = SendCommand(mDevice, mEndPointId);
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(chipTool, "Failed to send message: %s", ErrorStr(err)));
}

UpdateWaitForResponse(true);
err = SendCommand(mDevice, mEndPointId);
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(chipTool, "Failed to send message: %s", ErrorStr(err)));
WaitForResponse(kWaitDurationInSeconds);

VerifyOrExit(GetCommandExitStatus(), err = CHIP_ERROR_INTERNAL);

exit:
mCommissioner.ServiceEventSignal();
mCommissioner.Shutdown();
return err;
}
4 changes: 1 addition & 3 deletions examples/chip-tool/commands/clusters/ModelCommand.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,11 @@ class ModelCommand : public Command
void AddArguments() { AddArgument("endpoint-id", CHIP_ZCL_ENDPOINT_MIN, CHIP_ZCL_ENDPOINT_MAX, &mEndPointId); }

/////////// Command Interface /////////
CHIP_ERROR Run(PersistentStorage & storage, NodeId localId, NodeId remoteId) override;
CHIP_ERROR Run(NodeId localId, NodeId remoteId) override;

virtual CHIP_ERROR SendCommand(ChipDevice * device, uint8_t endPointId) = 0;

private:
ChipDeviceController mCommissioner;
ChipDevice * mDevice;
chip::Controller::ExampleOperationalCredentialsIssuer mOpCredsIssuer;
uint8_t mEndPointId;
};
21 changes: 20 additions & 1 deletion examples/chip-tool/commands/common/Command.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#pragma once

#include "controller/ExampleOperationalCredentialsIssuer.h"
#include <controller/CHIPDeviceController.h>
#include <inet/InetInterface.h>
#include <support/Span.h>
Expand Down Expand Up @@ -90,9 +91,23 @@ class Command
::chip::Inet::InterfaceId interfaceId;
};

/**
* @brief
* Encapsulates key objects in the CHIP stack that need continued
* access, so wrapping it in here makes it nice and compactly encapsulated.
*/
struct ExecutionContext
{
ChipDeviceCommissioner * commissioner;
chip::Controller::ExampleOperationalCredentialsIssuer * opCredsIssuer;
PersistentStorage * storage;
};

Command(const char * commandName) : mName(commandName) {}
virtual ~Command() {}

void SetExecutionContext(ExecutionContext & execContext) { mExecContext = &execContext; }

const char * GetName(void) const { return mName; }
const char * GetAttribute(void) const;
const char * GetArgumentName(size_t index) const;
Expand Down Expand Up @@ -147,7 +162,7 @@ class Command
return AddArgument(name, min, max, reinterpret_cast<void *>(out), Number_uint64);
}

virtual CHIP_ERROR Run(PersistentStorage & storage, NodeId localId, NodeId remoteId) = 0;
virtual CHIP_ERROR Run(NodeId localId, NodeId remoteId) = 0;

bool GetCommandExitStatus() const { return mCommandExitStatus; }
void SetCommandExitStatus(bool status)
Expand All @@ -159,6 +174,10 @@ class Command
void UpdateWaitForResponse(bool value);
void WaitForResponse(uint16_t duration);

protected:
ExecutionContext * GetExecContext() { return mExecContext; }
ExecutionContext * mExecContext;

private:
bool InitArgument(size_t argIndex, char * argValue);
size_t AddArgument(const char * name, int64_t min, uint64_t max, void * out, ArgumentType type);
Expand Down
57 changes: 48 additions & 9 deletions examples/chip-tool/commands/common/Commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void Commands::Register(const char * clusterName, commands_list commandsList)
int Commands::Run(NodeId localId, NodeId remoteId, int argc, char ** argv)
{
CHIP_ERROR err = CHIP_NO_ERROR;
PersistentStorage storage;
chip::Controller::CommissionerInitParams initParams;

err = chip::Platform::MemoryInit();
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(Controller, "Init Memory failure: %s", chip::ErrorStr(err)));
Expand All @@ -51,19 +51,48 @@ int Commands::Run(NodeId localId, NodeId remoteId, int argc, char ** argv)
SuccessOrExit(err = chip::DeviceLayer::Internal::BLEMgrImpl().ConfigureBle(/* BLE adapter ID */ 0, /* BLE central */ true));
#endif

err = storage.Init();
err = mStorage.Init();
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(Controller, "Init Storage failure: %s", chip::ErrorStr(err)));

chip::Logging::SetLogFilter(storage.GetLoggingLevel());
chip::Logging::SetLogFilter(mStorage.GetLoggingLevel());

err = RunCommand(storage, localId, remoteId, argc, argv);
initParams.storageDelegate = &mStorage;

err = mOpCredsIssuer.Initialize(mStorage);
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(Controller, "Init failure! Operational Cred Issuer: %s", chip::ErrorStr(err)));

initParams.operationalCredentialsDelegate = &mOpCredsIssuer;

err = mController.SetUdpListenPort(mStorage.GetListenPort());
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(Controller, "Init failure! Commissioner: %s", chip::ErrorStr(err)));

err = mController.Init(localId, initParams);
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(Controller, "Init failure! Commissioner: %s", chip::ErrorStr(err)));

err = mController.ServiceEvents();
VerifyOrExit(err == CHIP_NO_ERROR, ChipLogError(Controller, "Init failure! Run Loop: %s", chip::ErrorStr(err)));

err = RunCommand(localId, remoteId, argc, argv);
SuccessOrExit(err);

exit:
#if CONFIG_DEVICE_LAYER
chip::DeviceLayer::PlatformMgr().StopEventLoopTask();
#endif

//
// We can call DeviceController::Shutdown() safely without grabbing the stack lock
// since the CHIP thread and event queue have been stopped, preventing any thread
// races.
//
// TODO: This doesn't hold true on Darwin, issue #7557 tracks the problem.
//
mController.Shutdown();

return (err == CHIP_NO_ERROR) ? EXIT_SUCCESS : EXIT_FAILURE;
}

CHIP_ERROR Commands::RunCommand(PersistentStorage & storage, NodeId localId, NodeId remoteId, int argc, char ** argv)
CHIP_ERROR Commands::RunCommand(NodeId localId, NodeId remoteId, int argc, char ** argv)
{
CHIP_ERROR err = CHIP_NO_ERROR;
std::map<std::string, CommandsVector>::iterator cluster;
Expand Down Expand Up @@ -125,11 +154,21 @@ CHIP_ERROR Commands::RunCommand(PersistentStorage & storage, NodeId localId, Nod
ExitNow(err = CHIP_ERROR_INVALID_ARGUMENT);
}

err = command->Run(storage, localId, remoteId);
if (err != CHIP_NO_ERROR)
{
ChipLogError(chipTool, "Run command failure: %s", chip::ErrorStr(err));
ExitNow();
Command::ExecutionContext execContext;

execContext.commissioner = &mController;
execContext.opCredsIssuer = &mOpCredsIssuer;
execContext.storage = &mStorage;

command->SetExecutionContext(execContext);

err = command->Run(localId, remoteId);
if (err != CHIP_NO_ERROR)
{
ChipLogError(chipTool, "Run command failure: %s", chip::ErrorStr(err));
ExitNow();
}
}

exit:
Expand Down
6 changes: 5 additions & 1 deletion examples/chip-tool/commands/common/Commands.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "../../config/PersistentStorage.h"
#include "Command.h"
#include <controller/ExampleOperationalCredentialsIssuer.h>
#include <map>

class Commands
Expand All @@ -32,7 +33,7 @@ class Commands
int Run(NodeId localId, NodeId remoteId, int argc, char ** argv);

private:
CHIP_ERROR RunCommand(PersistentStorage & storage, NodeId localId, NodeId remoteId, int argc, char ** argv);
CHIP_ERROR RunCommand(NodeId localId, NodeId remoteId, int argc, char ** argv);
std::map<std::string, CommandsVector>::iterator GetCluster(std::string clusterName);
Command * GetCommand(CommandsVector & commands, std::string commandName);
Command * GetGlobalCommand(CommandsVector & commands, std::string commandName, std::string attributeName);
Expand All @@ -44,4 +45,7 @@ class Commands
void ShowCommand(std::string executable, std::string clusterName, Command * command);

std::map<std::string, CommandsVector> mClusters;
chip::Controller::DeviceCommissioner mController;
chip::Controller::ExampleOperationalCredentialsIssuer mOpCredsIssuer;
PersistentStorage mStorage;
};
4 changes: 2 additions & 2 deletions examples/chip-tool/commands/discover/Commands.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,9 @@ class Update : public DiscoverCommand
CHIP_ERROR RunCommand(NodeId remoteId, uint64_t fabricId) override
{
ChipDevice * device;
ReturnErrorOnFailure(mCommissioner.GetDevice(remoteId, &device));
ReturnErrorOnFailure(GetExecContext()->commissioner->GetDevice(remoteId, &device));
ChipLogProgress(chipTool, "Mdns: Updating NodeId: %" PRIx64 " FabricId: %" PRIx64 " ...", remoteId, fabricId);
return mCommissioner.UpdateDevice(device, fabricId);
return GetExecContext()->commissioner->UpdateDevice(device, fabricId);
}

/////////// DeviceAddressUpdateDelegate Interface /////////
Expand Down
34 changes: 20 additions & 14 deletions examples/chip-tool/commands/discover/DiscoverCommand.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,33 @@

constexpr uint16_t kWaitDurationInSeconds = 30;

CHIP_ERROR DiscoverCommand::Run(PersistentStorage & storage, NodeId localId, NodeId remoteId)
CHIP_ERROR DiscoverCommand::Run(NodeId localId, NodeId remoteId)
{
chip::Controller::CommissionerInitParams params;
CHIP_ERROR err;

params.storageDelegate = &storage;
params.mDeviceAddressUpdateDelegate = this;
params.operationalCredentialsDelegate = &mOpCredsIssuer;

ReturnErrorOnFailure(mCommissioner.SetUdpListenPort(storage.GetListenPort()));
ReturnErrorOnFailure(mCommissioner.Init(localId, params));
ReturnErrorOnFailure(mCommissioner.ServiceEvents());
//
// Set this to true first BEFORE we send commands to ensure we don't
// end up in a situation where the response comes back faster than we can
// set the variable to true, which will cause it to block indefinitely.
//
UpdateWaitForResponse(true);

{
chip::DeviceLayer::StackLock lock;

ReturnErrorOnFailure(RunCommand(mNodeId, mFabricId));
GetExecContext()->commissioner->RegisterDeviceAddressUpdateDelegate(this);
err = RunCommand(mNodeId, mFabricId);
SuccessOrExit(err);
}

UpdateWaitForResponse(true);
WaitForResponse(kWaitDurationInSeconds);

mCommissioner.ServiceEventSignal();
mCommissioner.Shutdown();
exit:
if (err != CHIP_NO_ERROR)
{
return err;
}

VerifyOrReturnError(GetCommandExitStatus(), CHIP_ERROR_INTERNAL);

return CHIP_NO_ERROR;
}
6 changes: 1 addition & 5 deletions examples/chip-tool/commands/discover/DiscoverCommand.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,11 @@ class DiscoverCommand : public Command, public chip::Controller::DeviceAddressUp
void OnAddressUpdateComplete(NodeId nodeId, CHIP_ERROR error) override{};

/////////// Command Interface /////////
CHIP_ERROR Run(PersistentStorage & storage, NodeId localId, NodeId remoteId) override;
CHIP_ERROR Run(NodeId localId, NodeId remoteId) override;

virtual CHIP_ERROR RunCommand(NodeId remoteId, uint64_t fabricId) = 0;

protected:
ChipDeviceCommissioner mCommissioner;

private:
chip::NodeId mNodeId;
uint64_t mFabricId;
chip::Controller::ExampleOperationalCredentialsIssuer mOpCredsIssuer;
};
Loading

0 comments on commit 7ee2771

Please sign in to comment.