Skip to content

Commit

Permalink
Warm reboot: Add support for orchagent pre-shutdown warm-restart stat…
Browse files Browse the repository at this point in the history
…e check (sonic-net#562)

* Add orchagent pre-warm-restart check mechanism
* Add orchagent_restart_check options: --noFreeze & --skipPendingTaskCheck
* Add waitTime option for response from orchagent
* Fix build issue with latest master
* adapt to new dvs.runcmd() signature
* Move standard header before local headers
  • Loading branch information
jipanyang authored and qiluo-msft committed Sep 15, 2018
1 parent 41e61bd commit 9fda944
Show file tree
Hide file tree
Showing 8 changed files with 342 additions and 3 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ neighsyncd/neighsyncd
portsyncd/portsyncd
orchagent/orchagent
orchagent/routeresync
orchagent/orchagent_restart_check
swssconfig/swssconfig
swssconfig/swssplayer
tests/tests
6 changes: 5 additions & 1 deletion orchagent/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ dist_swss_DATA = \
pfc_detect_barefoot.lua \
pfc_restore.lua

bin_PROGRAMS = orchagent routeresync
bin_PROGRAMS = orchagent routeresync orchagent_restart_check

if DEBUG
DBGFLAGS = -ggdb -DDEBUG
Expand Down Expand Up @@ -86,3 +86,7 @@ routeresync_SOURCES = routeresync.cpp
routeresync_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON)
routeresync_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON)
routeresync_LDADD = -lswsscommon

orchagent_restart_check_SOURCES = orchagent_restart_check.cpp
orchagent_restart_check_CPPFLAGS = $(DBGFLAGS) $(AM_CPPFLAGS) $(CFLAGS_COMMON)
orchagent_restart_check_LDADD = -lhiredis -lswsscommon -lpthread
145 changes: 145 additions & 0 deletions orchagent/orchagent_restart_check.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#include <climits>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include <unistd.h>
#include <getopt.h>

#include "notificationproducer.h"
#include "notificationconsumer.h"
#include "select.h"
#include "logger.h"


/*
 * Print the command line synopsis and a description of every option
 * accepted by orchagent_restart_check to standard output.
 */
void printUsage()
{
    SWSS_LOG_ENTER();

    // The synopsis lists every option parsed by main(), not only -s.
    std::cout << "Usage: orchagent_restart_check [-n] [-s] [-w <milliseconds>] [-h]" << "\n";
    std::cout << " -n --noFreeze" << "\n";
    std::cout << " Don't freeze orchagent even if check succeeded" << "\n";
    std::cout << " -s --skipPendingTaskCheck" << "\n";
    std::cout << " Skip pending task dependency check for orchagent" << "\n";
    std::cout << " -w --waitTime" << "\n";
    std::cout << " Wait time for response from orchagent, in milliseconds. Default value: 1000" << "\n";
    std::cout << " -h --help" << "\n";
    // Single flush at the end instead of one per line.
    std::cout << " Print out this message" << std::endl;
}


/*
 * Before stopping orchagent for warm restart, basic state check is preferred to
 * ensure orchagent is not in transient state, so a deterministic state may be
 * restored after restart.
 *
 * This binary sends a request on the "RESTARTCHECK" notification channel and
 * waits for the answer on "RESTARTCHECKREPLY". A reply whose data is "READY"
 * is treated as success; any other reply (e.g. "NOT_READY"), a timeout or a
 * select error is treated as failure.
 *
 * Options:
 *   -n / --noFreeze              sent as NoFreeze=true in the request
 *   -s / --skipPendingTaskCheck  sent as SkipPendingTaskCheck=true in the request
 *   -w / --waitTime <ms>         how long to wait for the reply (default 1000)
 *   -h / --help                  print usage and exit
 *
 * Returns EXIT_SUCCESS only when a "READY" reply was received, EXIT_FAILURE
 * otherwise (including invalid command line arguments).
 */
int main(int argc, char **argv)
{
    swss::Logger::getInstance().setMinPrio(swss::Logger::SWSS_INFO);
    SWSS_LOG_ENTER();

    std::string skipPendingTaskCheck = "false";
    std::string noFreeze = "false";
    /* Default wait time is 1000 millisecond */
    int waitTime = 1000;

    // 'h' must be listed here, otherwise getopt_long reports -h as unknown.
    const char* const optstring = "nsw:h";
    // getopt_long requires the array to end with an all-zero element.
    static const struct option long_options[] =
    {
        { "noFreeze",             no_argument,       0, 'n' },
        { "skipPendingTaskCheck", no_argument,       0, 's' },
        { "waitTime",             required_argument, 0, 'w' },
        { "help",                 no_argument,       0, 'h' },
        { 0,                      0,                 0, 0   }
    };

    while (true)
    {
        int option_index = 0;

        int c = getopt_long(argc, argv, optstring, long_options, &option_index);

        if (c == -1)
        {
            break;
        }

        switch (c)
        {
        case 'n':
            SWSS_LOG_NOTICE("Won't freeze orchagent even if check succeeded");
            noFreeze = "true";
            break;
        case 's':
            SWSS_LOG_NOTICE("Skipping pending task check for orchagent");
            skipPendingTaskCheck = "true";
            break;
        case 'w':
        {
            // Reject non-numeric or out-of-range values instead of silently
            // turning them into 0 as atoi() would.
            char *end = nullptr;
            const long parsed = strtol(optarg, &end, 10);
            if (end == optarg || *end != '\0' || parsed < INT_MIN || parsed > INT_MAX)
            {
                SWSS_LOG_ERROR("invalid wait time: %s", optarg);
                printUsage();
                exit(EXIT_FAILURE);
            }
            SWSS_LOG_NOTICE("Wait time for response from orchagent set to %s milliseconds", optarg);
            waitTime = static_cast<int>(parsed);
            break;
        }
        case 'h':
            printUsage();
            exit(EXIT_SUCCESS);

        case '?':
            SWSS_LOG_WARN("unknown option %c", optopt);
            printUsage();
            exit(EXIT_FAILURE);

        default:
            SWSS_LOG_ERROR("getopt_long failure");
            exit(EXIT_FAILURE);
        }
    }

    swss::DBConnector db(APPL_DB, swss::DBConnector::DEFAULT_UNIXSOCKET, 0);
    // Send warm restart query via "RESTARTCHECK" notification channel
    swss::NotificationProducer restartQuery(&db, "RESTARTCHECK");
    // Will listen for the reply on "RESTARTCHECKREPLY" channel.
    // The consumer is created before the request is sent.
    swss::NotificationConsumer restartQueryReply(&db, "RESTARTCHECKREPLY");

    std::vector<swss::FieldValueTuple> values;
    values.emplace_back("NoFreeze", noFreeze);
    values.emplace_back("SkipPendingTaskCheck", skipPendingTaskCheck);
    std::string op = "orchagent";
    SWSS_LOG_NOTICE("requested %s to do warm restart state check", op.c_str());
    restartQuery.send(op, op, values);

    swss::Select s;
    s.addSelectable(&restartQueryReply);
    swss::Selectable *sel;
    std::string op_ret, data;
    values.clear();
    int result = s.select(&sel, waitTime);
    if (result == swss::Select::OBJECT)
    {
        restartQueryReply.pop(op_ret, data, values);
        if (data == "READY")
        {
            SWSS_LOG_NOTICE("RESTARTCHECK success, %s is frozen and ready for warm restart", op_ret.c_str());
            std::cout << "RESTARTCHECK succeeded" << std::endl;
            return EXIT_SUCCESS;
        }
        else
        {
            SWSS_LOG_NOTICE("RESTARTCHECK failed, %s is not ready for warm restart with status %s",
                    op_ret.c_str(), data.c_str());
        }
    }
    else if (result == swss::Select::TIMEOUT)
    {
        // No reply was popped, so op_ret is empty here; log the requested target.
        SWSS_LOG_NOTICE("RESTARTCHECK for %s timed out", op.c_str());
    }
    else
    {
        // Same as above: op_ret was never filled in on this path.
        SWSS_LOG_NOTICE("RESTARTCHECK for %s error", op.c_str());
    }
    std::cout << "RESTARTCHECK failed" << std::endl;
    return EXIT_FAILURE;
}
60 changes: 60 additions & 0 deletions orchagent/orchdaemon.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <unistd.h>
#include <unordered_map>
#include <limits.h>
#include "orchdaemon.h"
#include "logger.h"
#include <sairedis.h>
Expand Down Expand Up @@ -343,6 +344,26 @@ void OrchDaemon::start()
* is a good chance to flush the pipeline before next select happened.
*/
flush();

/*
* Asked to check warm restart readiness.
* Not doing this under Select::TIMEOUT condition because of
* the existence of finer granularity ExecutableTimer with select
*/
if (gSwitchOrch->checkRestartReady())
{
bool ret = warmRestartCheck();
if (ret)
{
// Orchagent is ready to perform warm restart, stop processing any new db data.
// Should sleep here or continue handling timers and etc.??
if (!gSwitchOrch->checkRestartNoFreeze())
{
SWSS_LOG_WARN("Orchagent is frozen for warm restart!");
sleep(UINT_MAX);
}
}
}
}
}

Expand Down Expand Up @@ -435,3 +456,42 @@ bool OrchDaemon::warmRestoreValidation()
WarmStart::setWarmStartState("orchagent", WarmStart::RESTORED);
return true;
}

/*
* Reply with "READY" notification if no pending tasks, and return true.
* Ortherwise reply with "NOT_READY" notification and return false.
* Further consideration is needed as to when orchagent is treated as warm restart ready.
* For now, no pending task should exist in any orch agent.
*/
bool OrchDaemon::warmRestartCheck()
{
std::vector<swss::FieldValueTuple> values;
std::string op = "orchagent";
std::string data = "READY";
bool ret = true;

vector<string> ts;
getTaskToSync(ts);

if (ts.size() != 0)
{
SWSS_LOG_NOTICE("WarmRestart check found pending tasks: ");
for(auto &s : ts)
{
SWSS_LOG_NOTICE(" %s", s.c_str());
}
if (!gSwitchOrch->skipPendingTaskCheck())
{
data = "NOT_READY";
ret = false;
}
else
{
SWSS_LOG_NOTICE("Orchagent objects dependency check skipped");
}
}

SWSS_LOG_NOTICE("Restart check result: %s", data.c_str());
gSwitchOrch->restartCheckReply(op, data, values);
return ret;
}
2 changes: 2 additions & 0 deletions orchagent/orchdaemon.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ class OrchDaemon
bool warmRestoreAndSyncUp();
void getTaskToSync(vector<string> &ts);
bool warmRestoreValidation();

bool warmRestartCheck();
private:
DBConnector *m_applDb;
DBConnector *m_configDb;
Expand Down
56 changes: 55 additions & 1 deletion orchagent/switchorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include "switchorch.h"
#include "converter.h"
#include "notifier.h"
#include "notificationproducer.h"

using namespace std;
using namespace swss;
Expand All @@ -27,8 +29,12 @@ const map<string, sai_packet_action_t> packet_action_map =
};

SwitchOrch::SwitchOrch(DBConnector *db, string tableName) :
Orch(db, tableName)
Orch(db, tableName),
m_db(db)
{
m_restartCheckNotificationConsumer = new NotificationConsumer(db, "RESTARTCHECK");
auto restartCheckNotifier = new Notifier(m_restartCheckNotificationConsumer, this, "RESTARTCHECK");
Orch::addExecutor(restartCheckNotifier);
}

void SwitchOrch::doTask(Consumer &consumer)
Expand Down Expand Up @@ -122,3 +128,51 @@ void SwitchOrch::doTask(Consumer &consumer)
}
}

/*
 * Handle a notification delivered to this orch.
 *
 * The notification is always popped from the consumer. If it came from the
 * RESTARTCHECK consumer, the stored warm restart check request is reset and,
 * when the op is "orchagent", re-populated from the notification fields
 * "NoFreeze" and "SkipPendingTaskCheck" (a flag is set only for value "true").
 */
void SwitchOrch::doTask(NotificationConsumer& consumer)
{
    SWSS_LOG_ENTER();

    std::string requestOp;
    std::string requestData;
    std::vector<swss::FieldValueTuple> fields;

    consumer.pop(requestOp, requestData, fields);

    if (&consumer != m_restartCheckNotificationConsumer)
    {
        return;
    }

    // Forget any earlier request before looking at the new one.
    m_warmRestartCheck = WarmRestartCheck{false, false, false};

    SWSS_LOG_NOTICE("RESTARTCHECK notification for %s ", requestOp.c_str());
    if (requestOp != "orchagent")
    {
        return;
    }

    m_warmRestartCheck.checkRestartReadyState = true;

    // Human readable dump of the request: op|field:value|field:value...
    string summary = requestOp;
    for (const auto &fv : fields)
    {
        const string &field = fvField(fv);
        const string &value = fvValue(fv);

        summary += "|" + field + ":" + value;

        const bool enabled = (value == "true");
        if (enabled && field == "NoFreeze")
        {
            m_warmRestartCheck.noFreeze = true;
        }
        if (enabled && field == "SkipPendingTaskCheck")
        {
            m_warmRestartCheck.skipPendingTaskCheck = true;
        }
    }
    SWSS_LOG_NOTICE("%s", summary.c_str());
}

void SwitchOrch::restartCheckReply(const string &op, const string &data, std::vector<FieldValueTuple> &values)
{
NotificationProducer restartRequestReply(m_db, "RESTARTCHECKREPLY");
restartRequestReply.send(op, data, values);
checkRestartReadyDone();
}
20 changes: 20 additions & 0 deletions orchagent/switchorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,31 @@

#include "orch.h"

// State of a pre-shutdown warm restart check request.
// Kept as a plain aggregate: SwitchOrch initializes it positionally,
// so the member order must not change.
struct WarmRestartCheck
{
// True while a restart check request is waiting to be answered.
bool checkRestartReadyState;
// True when the requester asked orchagent not to freeze after a successful check.
bool noFreeze;
// True when the requester asked to ignore pending tasks in the check.
bool skipPendingTaskCheck;
};

// Orch for switch-level configuration. It also receives the pre-shutdown
// warm restart check request ("RESTARTCHECK" notification) and sends the
// reply for it.
class SwitchOrch : public Orch
{
public:
    SwitchOrch(DBConnector *db, string tableName);

    // Read-only views of the last received restart check request.
    // Const-qualified so they can also be called through a const SwitchOrch.

    // True while a restart check request is waiting to be answered.
    bool checkRestartReady() const { return m_warmRestartCheck.checkRestartReadyState; }
    // True when the request asked not to freeze orchagent.
    bool checkRestartNoFreeze() const { return m_warmRestartCheck.noFreeze; }
    // True when the request asked to skip the pending task check.
    bool skipPendingTaskCheck() const { return m_warmRestartCheck.skipPendingTaskCheck; }
    // Mark the current restart check request as handled.
    void checkRestartReadyDone() { m_warmRestartCheck.checkRestartReadyState = false; }
    // Send the check result on the reply channel and mark the request as handled.
    void restartCheckReply(const string &op, const string &data, std::vector<FieldValueTuple> &values);
private:
    void doTask(Consumer &consumer);

    // Consumer for the "RESTARTCHECK" notification channel.
    NotificationConsumer* m_restartCheckNotificationConsumer;
    void doTask(NotificationConsumer& consumer);
    // DB connection passed to the constructor; used for the reply producer.
    DBConnector *m_db;

    // Information contained in the request from
    // external program for orchagent pre-shutdown state check
    WarmRestartCheck m_warmRestartCheck = {false, false, false};
};
Loading

0 comments on commit 9fda944

Please sign in to comment.