Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RESTARTCHECK failed debug logs (Do not upstream) #10

Open
wants to merge 2 commits into
base: 202405
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion orchagent/orchagent_restart_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,29 @@ int main(int argc, char **argv)
while (retries <= retryCount)
{
SWSS_LOG_NOTICE("requested %s to do warm restart state check, retry count: %d", op.c_str(), retries);
restartQuery.send(op, op, values);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- retry count: %d", retries);

//restartQuery.send(op, op, values);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- before send");
long long send_res = restartQuery.send(op, op, values);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- send resulst = %lld", send_res);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- after send");

std::string op_ret, data;
std::vector<swss::FieldValueTuple> values_ret;

SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- before select");
int result = s.select(&sel, waitTime);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- select resulst = %d", result);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- after select");

if (result == swss::Select::OBJECT)
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- before pop");
restartQueryReply.pop(op_ret, data, values_ret);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- after after");

SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- data = %s , op_ret = %s", data.c_str(), op_ret.c_str());
if (data == "READY")
{
SWSS_LOG_NOTICE("RESTARTCHECK success, %s is frozen and ready for warm restart", op_ret.c_str());
Expand All @@ -147,6 +162,7 @@ int main(int argc, char **argv)
}
else if (result == swss::Select::TIMEOUT)
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- orchagent_restart_check --- TIMED OUT");
SWSS_LOG_NOTICE("RESTARTCHECK for %s timed out", op_ret.c_str());
}
else
Expand Down
28 changes: 28 additions & 0 deletions orchagent/orchdaemon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,7 @@ void OrchDaemon::logRotate() {
void OrchDaemon::start()
{
SWSS_LOG_ENTER();
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- Entered function OrchDaemon::start");

Recorder::Instance().sairedis.setRotate(false);

Expand All @@ -825,6 +826,7 @@ void OrchDaemon::start()

auto tstart = std::chrono::high_resolution_clock::now();

SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- Starting infinite loop while(true)");
while (true)
{
Selectable *s;
Expand Down Expand Up @@ -879,38 +881,56 @@ void OrchDaemon::start()
for (Orch *o : m_orchList)
o->doTask();

SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- Before if(checkRestartReady)");
/*
* Asked to check warm restart readiness.
* Not doing this under Select::TIMEOUT condition because of
* the existence of finer granularity ExecutableTimer with select
*/
if (gSwitchOrch && gSwitchOrch->checkRestartReady())
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- entered if (gSwitchOrch && gSwitchOrch->checkRestartReady())");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- before call to warmRestartCheck()");
bool ret = warmRestartCheck();
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- after call to warmRestartCheck()");

SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- ret = %d", ret);
if (ret)
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- entered if (ret)");
// Orchagent is ready to perform warm restart, stop processing any new db data.
// Should sleep here or continue handling timers and etc.??
if (!gSwitchOrch->checkRestartNoFreeze())
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- entered if (!gSwitchOrch->checkRestartNoFreeze())");
// Disable FDB aging
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- before gSwitchOrch->setAgingFDB(0);");
gSwitchOrch->setAgingFDB(0);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- after gSwitchOrch->setAgingFDB(0);");

// Disable FDB learning on all bridge ports
if (gPortsOrch)
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- entered if (gPortsOrch)");
for (auto& pair: gPortsOrch->getAllPorts())
{
auto& port = pair.second;
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- before gPortsOrch->setBridgePortLearningFDB(port, SAI_BRIDGE_PORT_FDB_LEARNING_MODE_DISABLE)");
gPortsOrch->setBridgePortLearningFDB(port, SAI_BRIDGE_PORT_FDB_LEARNING_MODE_DISABLE);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- after gPortsOrch->setBridgePortLearningFDB(port, SAI_BRIDGE_PORT_FDB_LEARNING_MODE_DISABLE)");
}
}

// Flush sairedis's redis pipeline
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- before flush();");
flush();
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- after flush();");

SWSS_LOG_WARN("Orchagent is frozen for warm restart!");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- Orchagent is frozen for warm restart!");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- before freezeAndHeartBeat(UINT_MAX);");
freezeAndHeartBeat(UINT_MAX);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::start --- after freezeAndHeartBeat(UINT_MAX);");
}
}
}
Expand Down Expand Up @@ -1031,6 +1051,7 @@ bool OrchDaemon::warmRestoreValidation()
*/
bool OrchDaemon::warmRestartCheck()
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::warmRestartCheck --- Entered OrchDaemon::warmRestartCheck");
std::vector<swss::FieldValueTuple> values;
std::string op = "orchagent";
std::string data = "READY";
Expand All @@ -1041,23 +1062,30 @@ bool OrchDaemon::warmRestartCheck()

if (ts.size() != 0)
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::warmRestartCheck --- Entered if (ts.size() != 0)");
SWSS_LOG_NOTICE("WarmRestart check found pending tasks: ");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::warmRestartCheck --- WarmRestart check found pending tasks: ");
for(auto &s : ts)
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::warmRestartCheck --- %s", s.c_str());
SWSS_LOG_NOTICE(" %s", s.c_str());
}
if (!gSwitchOrch->skipPendingTaskCheck())
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::warmRestartCheck --- Entered if (!gSwitchOrch->skipPendingTaskCheck())");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::warmRestartCheck --- setting data = \"NOT_READY\", ret = false");
data = "NOT_READY";
ret = false;
}
else
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::warmRestartCheck --- Orchagent objects dependency check skipped");
SWSS_LOG_NOTICE("Orchagent objects dependency check skipped");
}
}

SWSS_LOG_NOTICE("Restart check result: %s", data.c_str());
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- OrchDaemon::warmRestartCheck --- Restart check result: data = %s", data.c_str());
gSwitchOrch->restartCheckReply(op, data, values);
return ret;
}
Expand Down
22 changes: 22 additions & 0 deletions orchagent/switchorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ SwitchOrch::SwitchOrch(DBConnector *db, vector<TableConnector>& connectors, Tabl
m_stateDbForNotification(new DBConnector(STATE_DB, DBConnector::DEFAULT_UNIXSOCKET, 0)),
m_asicSdkHealthEventTable(new Table(m_stateDbForNotification.get(), STATE_ASIC_SDK_HEALTH_EVENT_TABLE_NAME))
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::SwitchOrch --- Entered SwitchOrch::SwitchOrch");
m_restartCheckNotificationConsumer = new NotificationConsumer(db, "RESTARTCHECK");
auto restartCheckNotifier = new Notifier(m_restartCheckNotificationConsumer, this, "RESTARTCHECK");
Orch::addExecutor(restartCheckNotifier);
Expand Down Expand Up @@ -1057,6 +1058,7 @@ void SwitchOrch::doTask(Consumer &consumer)

void SwitchOrch::doTask(NotificationConsumer& consumer)
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- Entered function SwitchOrch::doTask");
SWSS_LOG_ENTER();

std::string op;
Expand All @@ -1065,8 +1067,13 @@ void SwitchOrch::doTask(NotificationConsumer& consumer)

consumer.pop(op, data, values);

SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- consumer.pop(op, data, values)");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- op = %s, data = %s", op.c_str(), data.c_str());

if (&consumer != m_restartCheckNotificationConsumer)
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- Entered if (&consumer != m_restartCheckNotificationConsumer)");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- Returning");
return;
}

Expand All @@ -1077,31 +1084,46 @@ void SwitchOrch::doTask(NotificationConsumer& consumer)
SWSS_LOG_NOTICE("RESTARTCHECK notification for %s ", op.c_str());
if (op == "orchagent")
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- Entered if (op == \"orchagent\")");
string s = op;

m_warmRestartCheck.checkRestartReadyState = true;
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- setting m_warmRestartCheck.checkRestartReadyState = true");

SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- Going over values vector:");
for (auto &i : values)
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- <%s,%s>", fvField(i).c_str(), fvValue(i).c_str());
s += "|" + fvField(i) + ":" + fvValue(i);

if (fvField(i) == "NoFreeze" && fvValue(i) == "true")
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- setting m_warmRestartCheck.noFreeze = true;");
m_warmRestartCheck.noFreeze = true;
}
if (fvField(i) == "SkipPendingTaskCheck" && fvValue(i) == "true")
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- setting m_warmRestartCheck.skipPendingTaskCheck = true;");
m_warmRestartCheck.skipPendingTaskCheck = true;
}
}
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::doTask --- final s = %s", s.c_str());
SWSS_LOG_NOTICE("%s", s.c_str());
}
}

void SwitchOrch::restartCheckReply(const string &op, const string &data, std::vector<FieldValueTuple> &values)
{
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::restartCheckReply --- Entered function SwitchOrch::restartCheckReply");
NotificationProducer restartRequestReply(m_db, "RESTARTCHECKREPLY");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::restartCheckReply --- Before restartRequestReply.send(op, data, values)");
restartRequestReply.send(op, data, values);
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::restartCheckReply --- After restartRequestReply.send(op, data, values)");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::restartCheckReply --- before checkRestartReadyDone();");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::restartCheckReply --- setting: m_warmRestartCheck.checkRestartReadyState = false;");
checkRestartReadyDone();
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::restartCheckReply --- after checkRestartReadyDone();");
SWSS_LOG_NOTICE("--- RESTARTCHECK_failed_debug --- SwitchOrch::restartCheckReply --- Leaving function restartCheckReply");
}

void SwitchOrch::onSwitchAsicSdkHealthEvent(sai_object_id_t switch_id,
Expand Down
Loading