Skip to content

Commit d823dd1

Browse files
authored
[MirrorOrch]: Mirror Session Retention across Warm Reboot (sonic-net#1054)
* [MirrorOrch]: Mirror Session Retention across Warm Reboot After warm reboot, it is expected that the monitor port of the mirror session is retained - no changing on the monitor port withint the ECMP group members and the LAG members. This is due to the general of the sairedis comparison logic and the minimalization of SAI function calls during reconciliation. Changes: 1. Add bake() and postBake() functions in MirrorOrch bake() function retrieves the state database information and get the VLAN + monitor port information. postBake() function leverages the information and recovers the active mirror sessions the same as before warm reboot. 2. state database format change Instead of storing the object ID of the monitor port, store the alias of the monitor port. Instead of storing true/false of VLAN header, store the VLAN ID. Update: Freeze doTask() function instead of update() function With this update, we could fix potential orchagent issues before the warm reboot when the monitor port was wrongly calculated. Signed-off-by: Shu0T1an ChenG <[email protected]>
1 parent a5b6e7c commit d823dd1

File tree

7 files changed

+204
-33
lines changed

7 files changed

+204
-33
lines changed

orchagent/mirrororch.cpp

+172-24
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@
1010
#include "swssnet.h"
1111
#include "converter.h"
1212
#include "mirrororch.h"
13+
#include "tokenize.h"
1314

1415
#define MIRROR_SESSION_STATUS "status"
1516
#define MIRROR_SESSION_STATUS_ACTIVE "active"
1617
#define MIRROR_SESSION_STATUS_INACTIVE "inactive"
18+
#define MIRROR_SESSION_NEXT_HOP_IP "next_hop_ip"
1719
#define MIRROR_SESSION_SRC_IP "src_ip"
1820
#define MIRROR_SESSION_DST_IP "dst_ip"
1921
#define MIRROR_SESSION_GRE_TYPE "gre_type"
@@ -23,7 +25,7 @@
2325
#define MIRROR_SESSION_DST_MAC_ADDRESS "dst_mac"
2426
#define MIRROR_SESSION_MONITOR_PORT "monitor_port"
2527
#define MIRROR_SESSION_ROUTE_PREFIX "route_prefix"
26-
#define MIRROR_SESSION_VLAN_HEADER_VALID "vlan_header_valid"
28+
#define MIRROR_SESSION_VLAN_ID "vlan_id"
2729
#define MIRROR_SESSION_POLICER "policer"
2830

2931
#define MIRROR_SESSION_DEFAULT_VLAN_PRI 0
@@ -58,6 +60,7 @@ MirrorEntry::MirrorEntry(const string& platform) :
5860
}
5961

6062
nexthopInfo.prefix = IpPrefix("0.0.0.0/0");
63+
nexthopInfo.nexthop = IpAddress("0.0.0.0");
6164
}
6265

6366
MirrorOrch::MirrorOrch(TableConnector stateDbConnector, TableConnector confDbConnector,
@@ -75,6 +78,74 @@ MirrorOrch::MirrorOrch(TableConnector stateDbConnector, TableConnector confDbCon
7578
m_fdbOrch->attach(this);
7679
}
7780

81+
bool MirrorOrch::bake()
82+
{
83+
SWSS_LOG_ENTER();
84+
85+
// Freeze the route update during orchagent restoration
86+
m_freeze = true;
87+
88+
deque<KeyOpFieldsValuesTuple> entries;
89+
vector<string> keys;
90+
m_mirrorTable.getKeys(keys);
91+
for (const auto &key : keys)
92+
{
93+
vector<FieldValueTuple> tuples;
94+
m_mirrorTable.get(key, tuples);
95+
96+
bool active = false;
97+
string monitor_port;
98+
string next_hop_ip;
99+
100+
for (const auto &tuple : tuples)
101+
{
102+
if (fvField(tuple) == MIRROR_SESSION_STATUS)
103+
{
104+
active = fvValue(tuple) == MIRROR_SESSION_STATUS_ACTIVE;
105+
}
106+
else if (fvField(tuple) == MIRROR_SESSION_MONITOR_PORT)
107+
{
108+
monitor_port = fvValue(tuple);
109+
}
110+
else if (fvField(tuple) == MIRROR_SESSION_NEXT_HOP_IP)
111+
{
112+
next_hop_ip = fvValue(tuple);
113+
}
114+
}
115+
116+
if (!active)
117+
{
118+
continue;
119+
}
120+
121+
SWSS_LOG_NOTICE("Found mirror session %s active before warm reboot",
122+
key.c_str());
123+
124+
// Recover saved active session's monitor port
125+
m_recoverySessionMap.emplace(
126+
key, monitor_port + state_db_key_delimiter + next_hop_ip);
127+
}
128+
129+
return Orch::bake();
130+
}
131+
132+
bool MirrorOrch::postBake()
133+
{
134+
SWSS_LOG_ENTER();
135+
136+
SWSS_LOG_NOTICE("Start MirrorOrch post-baking");
137+
138+
// Unfreeze the route update
139+
m_freeze = false;
140+
141+
Orch::doTask();
142+
143+
// Clean up the recovery cache
144+
m_recoverySessionMap.clear();
145+
146+
return Orch::postBake();
147+
}
148+
78149
void MirrorOrch::update(SubjectType type, void *cntx)
79150
{
80151
SWSS_LOG_ENTER();
@@ -320,10 +391,11 @@ void MirrorOrch::setSessionState(const string& name, const MirrorEntry& session,
320391
{
321392
SWSS_LOG_ENTER();
322393

323-
SWSS_LOG_INFO("Setting mirroring sessions %s state\n", name.c_str());
394+
SWSS_LOG_INFO("Update mirroring sessions %s state", name.c_str());
324395

325396
vector<FieldValueTuple> fvVector;
326397
string value;
398+
327399
if (attr.empty() || attr == MIRROR_SESSION_STATUS)
328400
{
329401
value = session.status ? MIRROR_SESSION_STATUS_ACTIVE : MIRROR_SESSION_STATUS_INACTIVE;
@@ -332,8 +404,9 @@ void MirrorOrch::setSessionState(const string& name, const MirrorEntry& session,
332404

333405
if (attr.empty() || attr == MIRROR_SESSION_MONITOR_PORT)
334406
{
335-
value = sai_serialize_object_id(session.neighborInfo.portId);
336-
fvVector.emplace_back(MIRROR_SESSION_MONITOR_PORT, value);
407+
Port port;
408+
m_portsOrch->getPort(session.neighborInfo.portId, port);
409+
fvVector.emplace_back(MIRROR_SESSION_MONITOR_PORT, port.m_alias);
337410
}
338411

339412
if (attr.empty() || attr == MIRROR_SESSION_DST_MAC_ADDRESS)
@@ -348,10 +421,16 @@ void MirrorOrch::setSessionState(const string& name, const MirrorEntry& session,
348421
fvVector.emplace_back(MIRROR_SESSION_ROUTE_PREFIX, value);
349422
}
350423

351-
if (attr.empty() || attr == MIRROR_SESSION_VLAN_HEADER_VALID)
424+
if (attr.empty() || attr == MIRROR_SESSION_VLAN_ID)
425+
{
426+
value = to_string(session.neighborInfo.port.m_vlan_info.vlan_id);
427+
fvVector.emplace_back(MIRROR_SESSION_VLAN_ID, value);
428+
}
429+
430+
if (attr.empty() || attr == MIRROR_SESSION_NEXT_HOP_IP)
352431
{
353-
value = to_string(session.neighborInfo.port.m_type == Port::VLAN);
354-
fvVector.emplace_back(MIRROR_SESSION_VLAN_HEADER_VALID, value);
432+
value = session.nexthopInfo.nexthop.to_string();
433+
fvVector.emplace_back(MIRROR_SESSION_NEXT_HOP_IP, value);
355434
}
356435

357436
m_mirrorTable.set(name, fvVector);
@@ -396,32 +475,68 @@ bool MirrorOrch::getNeighborInfo(const string& name, MirrorEntry& session)
396475
return false;
397476
}
398477

399-
// Get the firt member of the LAG
400-
Port member;
401-
const auto& first_member_alias = *session.neighborInfo.port.m_members.begin();
402-
m_portsOrch->getPort(first_member_alias, member);
478+
// Recover the LAG member monitor port picked before warm reboot
479+
// to minimalize the data plane changes across warm reboot.
480+
if (m_recoverySessionMap.find(name) != m_recoverySessionMap.end())
481+
{
482+
string alias = tokenize(m_recoverySessionMap[name],
483+
state_db_key_delimiter, 1)[0];
484+
Port member;
485+
m_portsOrch->getPort(alias, member);
486+
487+
SWSS_LOG_NOTICE("Recover mirror session %s with LAG member port %s",
488+
name.c_str(), alias.c_str());
489+
session.neighborInfo.portId = member.m_port_id;
490+
}
491+
else
492+
{
493+
// Get the firt member of the LAG
494+
Port member;
495+
string first_member_alias = *session.neighborInfo.port.m_members.begin();
496+
m_portsOrch->getPort(first_member_alias, member);
497+
498+
session.neighborInfo.portId = member.m_port_id;
499+
}
403500

404-
session.neighborInfo.portId = member.m_port_id;
405501
return true;
406502
}
407503
case Port::VLAN:
408504
{
409505
SWSS_LOG_NOTICE("Get mirror session destination IP neighbor VLAN %d",
410506
session.neighborInfo.port.m_vlan_info.vlan_id);
411-
Port member;
412-
if (!m_fdbOrch->getPort(session.neighborInfo.mac, session.neighborInfo.port.m_vlan_info.vlan_id, member))
507+
508+
// Recover the VLAN member monitor port picked before warm reboot
509+
// since the FDB entries are not yet learned on the hardware
510+
if (m_recoverySessionMap.find(name) != m_recoverySessionMap.end())
413511
{
414-
SWSS_LOG_NOTICE("Waiting to get FDB entry MAC %s under VLAN %s",
415-
session.neighborInfo.mac.to_string().c_str(),
416-
session.neighborInfo.port.m_alias.c_str());
417-
return false;
512+
string alias = tokenize(m_recoverySessionMap[name],
513+
state_db_key_delimiter, 1)[0];
514+
Port member;
515+
m_portsOrch->getPort(alias, member);
516+
517+
SWSS_LOG_NOTICE("Recover mirror session %s with VLAN member port %s",
518+
name.c_str(), alias.c_str());
519+
session.neighborInfo.portId = member.m_port_id;
418520
}
419521
else
420522
{
421-
// Update monitor port
422-
session.neighborInfo.portId = member.m_port_id;
423-
return true;
523+
Port member;
524+
if (!m_fdbOrch->getPort(session.neighborInfo.mac,
525+
session.neighborInfo.port.m_vlan_info.vlan_id, member))
526+
{
527+
SWSS_LOG_NOTICE("Waiting to get FDB entry MAC %s under VLAN %s",
528+
session.neighborInfo.mac.to_string().c_str(),
529+
session.neighborInfo.port.m_alias.c_str());
530+
return false;
531+
}
532+
else
533+
{
534+
// Update monitor port
535+
session.neighborInfo.portId = member.m_port_id;
536+
}
424537
}
538+
539+
return true;
425540
}
426541
default:
427542
{
@@ -741,7 +856,7 @@ bool MirrorOrch::updateSessionType(const string& name, MirrorEntry& session)
741856
SWSS_LOG_NOTICE("Update mirror session %s VLAN to %s",
742857
name.c_str(), session.neighborInfo.port.m_alias.c_str());
743858

744-
setSessionState(name, session, MIRROR_SESSION_VLAN_HEADER_VALID);
859+
setSessionState(name, session, MIRROR_SESSION_VLAN_ID);
745860

746861
return true;
747862
}
@@ -782,7 +897,35 @@ void MirrorOrch::updateNextHop(const NextHopUpdate& update)
782897

783898
if (update.nexthopGroup != IpAddresses())
784899
{
785-
session.nexthopInfo.nexthop = *update.nexthopGroup.getIpAddresses().begin();
900+
SWSS_LOG_NOTICE(" next hop IPs: %s", update.nexthopGroup.to_string().c_str());
901+
902+
// Recover the session based on the state database information
903+
if (m_recoverySessionMap.find(name) != m_recoverySessionMap.end())
904+
{
905+
IpAddress nexthop = IpAddress(tokenize(m_recoverySessionMap[name],
906+
state_db_key_delimiter, 1)[1]);
907+
908+
// Check if recovered next hop IP is within the update's next hop IPs
909+
if (update.nexthopGroup.getIpAddresses().count(nexthop))
910+
{
911+
SWSS_LOG_NOTICE("Recover mirror session %s with next hop %s",
912+
name.c_str(), nexthop.to_string().c_str());
913+
session.nexthopInfo.nexthop = nexthop;
914+
}
915+
else
916+
{
917+
// Correct the next hop IP
918+
SWSS_LOG_NOTICE("Correct mirror session %s next hop from %s to %s",
919+
name.c_str(), session.nexthopInfo.nexthop.to_string().c_str(),
920+
nexthop.to_string().c_str());
921+
session.nexthopInfo.nexthop = *update.nexthopGroup.getIpAddresses().begin();
922+
}
923+
}
924+
else
925+
{
926+
// Pick the first one from the next hop group
927+
session.nexthopInfo.nexthop = *update.nexthopGroup.getIpAddresses().begin();
928+
}
786929
}
787930
else
788931
{
@@ -968,6 +1111,11 @@ void MirrorOrch::doTask(Consumer& consumer)
9681111
{
9691112
SWSS_LOG_ENTER();
9701113

1114+
if (m_freeze)
1115+
{
1116+
return;
1117+
}
1118+
9711119
if (!gPortsOrch->allPortsReady())
9721120
{
9731121
return;
@@ -991,7 +1139,7 @@ void MirrorOrch::doTask(Consumer& consumer)
9911139
}
9921140
else
9931141
{
994-
SWSS_LOG_ERROR("Unknown operation type %s\n", op.c_str());
1142+
SWSS_LOG_ERROR("Unknown operation type %s", op.c_str());
9951143
}
9961144

9971145
consumer.m_toSync.erase(it++);

orchagent/mirrororch.h

+6
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@ class MirrorOrch : public Orch, public Observer, public Subject
6969
MirrorOrch(TableConnector appDbConnector, TableConnector confDbConnector,
7070
PortsOrch *portOrch, RouteOrch *routeOrch, NeighOrch *neighOrch, FdbOrch *fdbOrch, PolicerOrch *policerOrch);
7171

72+
bool bake() override;
73+
bool postBake() override;
7274
void update(SubjectType, void *);
7375
bool sessionExists(const string&);
7476
bool getSessionStatus(const string&, bool&);
@@ -86,6 +88,10 @@ class MirrorOrch : public Orch, public Observer, public Subject
8688
Table m_mirrorTable;
8789

8890
MirrorTable m_syncdMirrors;
91+
// session_name -> VLAN | monitor_port_alias | next_hop_ip
92+
map<string, string> m_recoverySessionMap;
93+
94+
bool m_freeze = false;
8995

9096
void createEntry(const string&, const vector<FieldValueTuple>&);
9197
void deleteEntry(const string&);

orchagent/orch.cpp

+13-6
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,15 @@ Orch::Orch(DBConnector *db, const string tableName, int pri)
2727

2828
Orch::Orch(DBConnector *db, const vector<string> &tableNames)
2929
{
30-
for(auto it : tableNames)
30+
for (auto it : tableNames)
3131
{
3232
addConsumer(db, it, default_orch_pri);
3333
}
3434
}
3535

3636
Orch::Orch(DBConnector *db, const vector<table_name_with_pri_t> &tableNames_with_pri)
3737
{
38-
for(const auto& it : tableNames_with_pri)
38+
for (const auto& it : tableNames_with_pri)
3939
{
4040
addConsumer(db, it.first, it.second);
4141
}
@@ -60,7 +60,7 @@ Orch::~Orch()
6060
vector<Selectable *> Orch::getSelectables()
6161
{
6262
vector<Selectable *> selectables;
63-
for(auto& it : m_consumerMap)
63+
for (auto& it : m_consumerMap)
6464
{
6565
selectables.push_back(it.second.get());
6666
}
@@ -240,7 +240,7 @@ bool Orch::bake()
240240
{
241241
SWSS_LOG_ENTER();
242242

243-
for(auto &it : m_consumerMap)
243+
for (auto &it : m_consumerMap)
244244
{
245245
string executorName = it.first;
246246
auto executor = it.second;
@@ -257,6 +257,13 @@ bool Orch::bake()
257257
return true;
258258
}
259259

260+
bool Orch::postBake()
261+
{
262+
SWSS_LOG_ENTER();
263+
264+
return true;
265+
}
266+
260267
/*
261268
- Validates reference has proper format which is [table_name:object_name]
262269
- validates table_name exists
@@ -365,15 +372,15 @@ ref_resolve_status Orch::resolveFieldRefValue(
365372

366373
void Orch::doTask()
367374
{
368-
for(auto &it : m_consumerMap)
375+
for (auto &it : m_consumerMap)
369376
{
370377
it.second->drain();
371378
}
372379
}
373380

374381
void Orch::dumpPendingTasks(vector<string> &ts)
375382
{
376-
for(auto &it : m_consumerMap)
383+
for (auto &it : m_consumerMap)
377384
{
378385
Consumer* consumer = dynamic_cast<Consumer *>(it.second.get());
379386
if (consumer == NULL)

orchagent/orch.h

+2
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ class Orch
183183
// Prepare for warm start if Redis contains valid input data
184184
// otherwise fallback to cold start
185185
virtual bool bake();
186+
// Clean up the state set in bake()
187+
virtual bool postBake();
186188

187189
/* Iterate all consumers in m_consumerMap and run doTask(Consumer) */
188190
virtual void doTask();

0 commit comments

Comments
 (0)