Skip to content

Commit 313ef5c

Browse files
authored
Warmboot Vlan neigh restore fix (#1040)
* Send an ARP request after the first Vlan member port is added
* Add wait logic after a Vlan member is added; nbrmgr waits for the neighbor restore to complete
* Address review comment: pass db as a parameter and open it only once
1 parent 5841e06 commit 313ef5c

File tree

4 files changed

+84
-12
lines changed

4 files changed

+84
-12
lines changed

cfgmgr/nbrmgr.cpp

+15-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ NbrMgr::NbrMgr(DBConnector *cfgDb, DBConnector *appDb, DBConnector *stateDb, con
4747
m_statePortTable(stateDb, STATE_PORT_TABLE_NAME),
4848
m_stateLagTable(stateDb, STATE_LAG_TABLE_NAME),
4949
m_stateVlanTable(stateDb, STATE_VLAN_TABLE_NAME),
50-
m_stateIntfTable(stateDb, STATE_INTERFACE_TABLE_NAME)
50+
m_stateIntfTable(stateDb, STATE_INTERFACE_TABLE_NAME),
51+
m_stateNeighRestoreTable(stateDb, STATE_NEIGH_RESTORE_TABLE_NAME)
5152
{
5253
int err = 0;
5354

@@ -91,6 +92,19 @@ bool NbrMgr::isIntfStateOk(const string &alias)
9192
return false;
9293
}
9394

95+
bool NbrMgr::isNeighRestoreDone()
96+
{
97+
string value;
98+
99+
m_stateNeighRestoreTable.hget("Flags", "restored", value);
100+
if (value == "true")
101+
{
102+
SWSS_LOG_INFO("Kernel neighbor table restore is done");
103+
return true;
104+
}
105+
return false;
106+
}
107+
94108
bool NbrMgr::setNeighbor(const string& alias, const IpAddress& ip, const MacAddress& mac)
95109
{
96110
SWSS_LOG_ENTER();

cfgmgr/nbrmgr.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,15 @@ class NbrMgr : public Orch
2020
NbrMgr(DBConnector *cfgDb, DBConnector *appDb, DBConnector *stateDb, const vector<string> &tableNames);
2121
using Orch::doTask;
2222

23+
bool isNeighRestoreDone();
24+
2325
private:
2426
bool isIntfStateOk(const string &alias);
2527
bool setNeighbor(const string& alias, const IpAddress& ip, const MacAddress& mac);
2628

2729
void doTask(Consumer &consumer);
2830

29-
Table m_statePortTable, m_stateLagTable, m_stateVlanTable, m_stateIntfTable;
31+
Table m_statePortTable, m_stateLagTable, m_stateVlanTable, m_stateIntfTable, m_stateNeighRestoreTable;
3032
struct nl_sock *m_nl_sock;
3133
};
3234

cfgmgr/nbrmgrd.cpp

+19
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <mutex>
44
#include <fstream>
55
#include <iostream>
6+
#include <chrono>
67

78
#include "select.h"
89
#include "exec.h"
@@ -12,6 +13,9 @@
1213
using namespace std;
1314
using namespace swss;
1415

16+
/* Upper bound, in seconds, on how long startup waits for the kernel neighbor table restore */
#define RESTORE_NEIGH_WAIT_TIME_OUT 120
17+
/* Interval, in seconds, passed to sleep() between checks of the neighbor restore flag */
#define RESTORE_NEIGH_WAIT_TIME_INT 10
18+
1519
/* select() function timeout retry time, in millisecond */
1620
#define SELECT_TIMEOUT 1000
1721

@@ -50,6 +54,21 @@ int main(int argc, char **argv)
5054

5155
NbrMgr nbrmgr(&cfgDb, &appDb, &stateDb, cfg_nbr_tables);
5256

57+
chrono::steady_clock::time_point starttime = chrono::steady_clock::now();
58+
while (!nbrmgr.isNeighRestoreDone())
59+
{
60+
chrono::duration<double> time_span = chrono::duration_cast<chrono::duration<double>>
61+
(chrono::steady_clock::now() - starttime);
62+
int pasttime = int(time_span.count());
63+
SWSS_LOG_INFO("Kernel neighbor table restoration waited for %d seconds", pasttime);
64+
if (pasttime > RESTORE_NEIGH_WAIT_TIME_OUT)
65+
{
66+
SWSS_LOG_WARN("Kernel neighbor table restore is not finished!");
67+
break;
68+
}
69+
sleep(RESTORE_NEIGH_WAIT_TIME_INT);
70+
}
71+
5372
std::vector<Orch *> cfgOrchList = {&nbrmgr};
5473

5574
swss::Select s;

neighsyncd/restore_neighbors.py

+47-10
Original file line numberDiff line numberDiff line change
@@ -25,11 +25,29 @@
2525
from scapy.all import conf, in6_getnsma, inet_pton, inet_ntop, in6_getnsmac, get_if_hwaddr, Ether, ARP, IPv6, ICMPv6ND_NS, ICMPv6NDOptSrcLLAddr
2626
from swsscommon import swsscommon
2727
import errno
28+
import syslog
2829

2930
logger = logging.getLogger(__name__)
3031
logger.setLevel(logging.WARNING)
3132
logger.addHandler(logging.NullHandler())
3233

34+
# Identifier passed to syslog.openlog() for every message from this script
SYSLOG_IDENTIFIER = 'restore_neighbor'


def _log(priority, msg):
    """Write one message to syslog under SYSLOG_IDENTIFIER.

    priority: a syslog priority constant (e.g. syslog.LOG_INFO).
    msg: the message text handed to syslog.syslog().
    """
    syslog.openlog(SYSLOG_IDENTIFIER)
    try:
        syslog.syslog(priority, msg)
    finally:
        # Close the log even when syslog() raises, so the open/close
        # pairing is never left unbalanced.
        syslog.closelog()


def log_info(msg):
    """Log msg to syslog at LOG_INFO priority."""
    _log(syslog.LOG_INFO, msg)


def log_warning(msg):
    """Log msg to syslog at LOG_WARNING priority."""
    _log(syslog.LOG_WARNING, msg)


def log_error(msg):
    """Log msg to syslog at LOG_ERR priority."""
    _log(syslog.LOG_ERR, msg)
50+
3351
# timeout the restore process in 110 seconds if not finished
3452
# This is mostly to wait for interfaces to be created and up after system warm-reboot
3553
# and this process is started by supervisord in swss docker.
@@ -58,12 +76,27 @@ def is_intf_oper_state_up(intf):
5876
state_file = open(oper_file.format(intf), 'r')
5977
state = state_file.readline().rstrip()
6078
except Exception as e:
61-
logger.info('Error: {}'.format(str(e)))
79+
log_info('Error: {}'.format(str(e)))
6280
return False
6381
if state == '1':
6482
return True
6583
return False
6684

85+
def is_intf_up(intf, db):
    """Return True when intf is ready to have its neighbors restored.

    intf: interface name.
    db:   connector exposing keys() and a STATE_DB attribute; the caller
          in this file passes a connected swsssdk.SonicV2Connector.

    An interface whose oper state is not up is never ready. For a name
    containing 'Vlan', at least one VLAN_MEMBER_TABLE key for that Vlan
    must also exist in STATE_DB. The first time a Vlan passes that check
    the function sleeps once for 3*CHECK_INTERVAL (the "wait logic after
    Vlan member add"); later calls do not sleep again.
    """
    if not is_intf_oper_state_up(intf):
        return False
    # NOTE(review): nesting below is reconstructed from a listing with the
    # indentation stripped -- confirm the one-time sleep and the "is up"
    # log belong inside the Vlan branch.
    if 'Vlan' in intf:
        table_name = 'VLAN_MEMBER_TABLE|{}|*'.format(intf)
        members = db.keys(db.STATE_DB, table_name)
        # Treat both None and an empty result as "no member yet".
        if not members:
            log_info ("Vlan member is not yet created")
            return False
        # counter is normally reset to 0 by the caller; default to 0 so a
        # call made before that reset does not raise AttributeError.
        if getattr(is_intf_up, 'counter', 0) == 0:
            time.sleep(3*CHECK_INTERVAL)
            is_intf_up.counter = 1
        log_info ("intf {} is up".format(intf))
    return True
99+
67100
# read the neigh table from AppDB to memory, format as below
68101
# build map as below, this can efficiently access intf and family groups later
69102
# { intf1 -> { { family1 -> [[ip1, mac1], [ip2, mac2] ...] }
@@ -131,7 +164,7 @@ def read_neigh_table_to_maps():
131164

132165
# Use netlink to set neigh table into kernel, not overwrite the existing ones
133166
def set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac):
134-
logging.info('Add neighbor entries: family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
167+
log_info('Add neighbor entries: family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
135168
family, intf_idx, dst_ip, dmac))
136169

137170
if family not in ip_family:
@@ -152,7 +185,7 @@ def set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac):
152185
# If neigh exists, log it but no exception raise, other exceptions, raise
153186
except NetlinkError as e:
154187
if e[0] == errno.EEXIST:
155-
logger.warning('Neigh exists in kernel with family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
188+
log_warning('Neigh exists in kernel with family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
156189
family, intf_idx, dst_ip, dmac))
157190
else:
158191
raise
@@ -196,10 +229,13 @@ def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT):
196229
ipclass = IPRoute()
197230
mtime = monotonic.time.time
198231
start_time = mtime()
232+
is_intf_up.counter = 0
233+
db = swsssdk.SonicV2Connector(host='127.0.0.1')
234+
db.connect(db.STATE_DB, False)
199235
while (mtime() - start_time) < timeout:
200236
for intf, family_neigh_map in intf_neigh_map.items():
201237
# only try to restore to kernel when link is up
202-
if is_intf_oper_state_up(intf):
238+
if is_intf_up(intf, db):
203239
src_mac = get_if_hwaddr(intf)
204240
intf_idx = ipclass.link_lookup(ifname=intf)[0]
205241
# create socket per intf to send packets
@@ -215,6 +251,8 @@ def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT):
215251
# use netlink to set neighbor entries
216252
set_neigh_in_kernel(ipclass, family, intf_idx, dst_ip, dmac)
217253

254+
log_info('Sending Neigh with family: {}, intf_idx: {}, ip: {}, mac: {}'.format(
255+
family, intf_idx, dst_ip, dmac))
218256
# sending arp/ns packet to update kernel neigh info
219257
s.send(build_arp_ns_pkt(family, src_mac, src_ip, dst_ip))
220258
# delete this family on the intf
@@ -229,28 +267,27 @@ def restore_update_kernel_neighbors(intf_neigh_map, timeout=DEF_TIME_OUT):
229267
if not intf_neigh_map:
230268
break
231269
time.sleep(CHECK_INTERVAL)
270+
db.close(db.STATE_DB)
232271

233272

234273
def main():
235274

236-
print "restore_neighbors service is started"
237-
275+
log_info ("restore_neighbors service is started")
238276
# Use warmstart python binding to check warmstart information
239277
warmstart = swsscommon.WarmStart()
240278
warmstart.initialize("neighsyncd", "swss")
241279
warmstart.checkWarmStart("neighsyncd", "swss", False)
242280

243281
# if swss or system warm reboot not enabled, don't run
244282
if not warmstart.isWarmStart():
245-
print "restore_neighbors service is skipped as warm restart not enabled"
283+
log_info ("restore_neighbors service is skipped as warm restart not enabled")
246284
return
247285

248286
# swss restart not system warm reboot, set statedb directly
249287
if not warmstart.isSystemWarmRebootEnabled():
250288
set_statedb_neigh_restore_done()
251-
print "restore_neighbors service is done as system warm reboot not enabled"
289+
log_info ("restore_neighbors service is done as system warm reboot not enabled")
252290
return
253-
254291
# read the neigh table from appDB to internal map
255292
try:
256293
intf_neigh_map = read_neigh_table_to_maps()
@@ -266,7 +303,7 @@ def main():
266303

267304
# set statedb to signal other processes like neighsyncd
268305
set_statedb_neigh_restore_done()
269-
print "restore_neighbor service is done for system warmreboot"
306+
log_info ("restore_neighbor service is done for system warmreboot")
270307
return
271308

272309
if __name__ == '__main__':

0 commit comments

Comments
 (0)