Skip to content

Commit

Permalink
speed up route check 202405 v2 (#3665)
Browse files Browse the repository at this point in the history
This PR fixes sonic-net/sonic-buildimage#18773 and sonic-net/sonic-buildimage#20648

Note: this PR is a resubmission. The original PR was reverted due to a PR check failure. The failure has been root-caused and fixed via sonic-net/sonic-mgmt#16013
  • Loading branch information
deepak-singhal0408 authored Dec 15, 2024
1 parent e39964d commit 3561996
Showing 1 changed file with 132 additions and 103 deletions.
235 changes: 132 additions & 103 deletions scripts/route_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import signal
import traceback
import subprocess
import concurrent.futures

from ipaddress import ip_network
from swsscommon import swsscommon
Expand Down Expand Up @@ -348,10 +349,18 @@ def is_suppress_fib_pending_enabled(namespace):
return state == 'enabled'


def get_frr_routes(namespace):
def fetch_routes(cmd):
    """
    Run a route CLI command and decode the JSON it prints.
    :param cmd: command given as an argument list, handed unchanged to subprocess
    :return the parsed JSON document printed by the command
    :raises subprocess.CalledProcessError if the command exits with a non-zero status
    """
    return json.loads(subprocess.check_output(cmd, text=True))


def get_frr_routes_parallel(namespace):
"""
Read routes from zebra through CLI command for IPv4 and IPv6 in parallel
:return combined IPv4 and IPv6 routes dictionary.
"""
if namespace == multi_asic.DEFAULT_NAMESPACE:
v4_route_cmd = ['show', 'ip', 'route', 'json']
Expand All @@ -360,12 +369,16 @@ def get_frr_routes(namespace):
v4_route_cmd = ['show', 'ip', 'route', '-n', namespace, 'json']
v6_route_cmd = ['show', 'ipv6', 'route', '-n', namespace, 'json']

output = subprocess.check_output(v4_route_cmd, text=True)
routes = json.loads(output)
output = subprocess.check_output(v6_route_cmd, text=True)
routes.update(json.loads(output))
print_message(syslog.LOG_DEBUG, "FRR Routes: namespace={}, routes={}".format(namespace, routes))
return routes
with concurrent.futures.ThreadPoolExecutor() as executor:
future_v4 = executor.submit(fetch_routes, v4_route_cmd)
future_v6 = executor.submit(fetch_routes, v6_route_cmd)

v4_routes = future_v4.result()
v6_routes = future_v6.result()

v4_routes.update(v6_routes)
print_message(syslog.LOG_DEBUG, "FRR Routes: namespace={}, routes={}".format(namespace, v4_routes))
return v4_routes


def get_interfaces(namespace):
Expand Down Expand Up @@ -566,7 +579,7 @@ def check_frr_pending_routes(namespace):
retries = FRR_CHECK_RETRIES
for i in range(retries):
missed_rt = []
frr_routes = get_frr_routes(namespace)
frr_routes = get_frr_routes_parallel(namespace)

for _, entries in frr_routes.items():
for entry in entries:
Expand Down Expand Up @@ -699,25 +712,99 @@ def _filter_out_neigh_route(routes, neighs):
return rt_appl_miss, rt_asic_miss


def check_routes(namespace):
def check_routes_for_namespace(namespace):
    """
    Process a single namespace: the heart of this script, which runs the checks.

    Read APPL-DB & ASIC-DB, the relevant tables for route checking.
    Check that the routes in ASIC-DB match APPL-DB, discounting local &
    default routes. Missed / unexpected entries in ASIC might be due to
    update latency between APPL & ASIC DBs, so collect ASIC-DB subscribe
    updates for a second and check whether a SET command is seen for the
    missing ones and a DEL command for the unexpected ones.
    Any diffs between APPL & ASIC DB that remain unjustified are reported.
    If there are FRR routes that aren't marked offloaded while all APPL &
    ASIC DB routes are in sync, that is logged as an error and a mitigation
    action is performed (when suppress-fib-pending is enabled).

    :param namespace: namespace whose routes are checked
    :return tuple (results, adds, deletes): results maps a failure category
        ("missed_ROUTE_TABLE_routes", "missed_INTF_TABLE_entries",
        "Unaccounted_ROUTE_ENTRY_TABLE_entries", "missed_FRR_routes") to the
        unjustifiable entries and is empty when everything is in sync;
        adds / deletes are the subscribe updates collected from ASIC-DB
        (empty lists when no wait for updates was needed).
    """
    results = {}
    adds = []
    deletes = []
    rt_frr_miss = []

    # get_asicdb_routes also hands back the selector / subscriber that
    # get_subscribe_updates consumes further down, so it is called first.
    selector, subs, rt_asic = get_asicdb_routes(namespace)

    rt_appl = get_appdb_routes(namespace)
    intf_appl = get_interfaces(namespace)

    # Diff APPL-DB routes & ASIC-DB routes
    rt_appl_miss, rt_asic_miss = diff_sorted_lists(rt_appl, rt_asic)

    # Check missed ASIC routes against APPL-DB INTF_TABLE
    _, rt_asic_miss = diff_sorted_lists(intf_appl, rt_asic_miss)
    rt_asic_miss = filter_out_default_routes(rt_asic_miss)
    rt_asic_miss = filter_out_vnet_routes(namespace, rt_asic_miss)
    rt_asic_miss = filter_out_standalone_tunnel_routes(namespace, rt_asic_miss)
    rt_asic_miss = filter_out_soc_ip_routes(namespace, rt_asic_miss)

    # Check APPL-DB INTF_TABLE with ASIC table route entries
    intf_appl_miss, _ = diff_sorted_lists(intf_appl, rt_asic)

    if rt_appl_miss:
        rt_appl_miss = filter_out_local_interfaces(namespace, rt_appl_miss)

    if rt_appl_miss:
        rt_appl_miss = filter_out_voq_neigh_routes(namespace, rt_appl_miss)

    # NOTE: On dualtor environment, ignore any route miss for the
    # neighbors learned from the vlan subnet.
    if rt_appl_miss or rt_asic_miss:
        rt_appl_miss, rt_asic_miss = filter_out_vlan_neigh_route_miss(
            namespace, rt_appl_miss, rt_asic_miss)

    if rt_appl_miss or rt_asic_miss:
        # Look for subscribe updates for a second
        adds, deletes = get_subscribe_updates(selector, subs)

        # Drop all those for which SET received
        rt_appl_miss, _ = diff_sorted_lists(rt_appl_miss, adds)

        # Drop all those for which DEL received
        rt_asic_miss, _ = diff_sorted_lists(rt_asic_miss, deletes)

    if rt_appl_miss:
        results["missed_ROUTE_TABLE_routes"] = rt_appl_miss

    if intf_appl_miss:
        results["missed_INTF_TABLE_entries"] = intf_appl_miss

    if rt_asic_miss:
        results["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss

    if is_bgp_suppress_fib_pending_enabled(namespace):
        rt_frr_miss = check_frr_pending_routes(namespace)

        if rt_frr_miss:
            results["missed_FRR_routes"] = rt_frr_miss

    # A non-empty rt_frr_miss always has an entry in results, so no separate
    # check on results is needed before deciding on mitigation.
    if rt_frr_miss and not rt_appl_miss and not rt_asic_miss:
        print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR{} but all "
                      "routes in APPL_DB and ASIC_DB are in sync".format(namespace))
        if is_suppress_fib_pending_enabled(namespace):
            mitigate_installed_not_offloaded_frr_routes(namespace, rt_frr_miss, rt_appl)

    return results, adds, deletes


def check_routes(namespace):
"""
Main function to parallelize route checks across all namespaces.
"""
namespace_list = []
if namespace is not multi_asic.DEFAULT_NAMESPACE and namespace in multi_asic.get_namespace_list():
namespace_list.append(namespace)
Expand All @@ -726,95 +813,37 @@ def check_routes(namespace):
print_message(syslog.LOG_INFO, "Checking routes for namespaces: ", namespace_list)

results = {}
adds = {}
deletes = {}
for namespace in namespace_list:
intf_appl_miss = []
rt_appl_miss = []
rt_asic_miss = []
rt_frr_miss = []
adds[namespace] = []
deletes[namespace] = []

selector, subs, rt_asic = get_asicdb_routes(namespace)

rt_appl = get_appdb_routes(namespace)
intf_appl = get_interfaces(namespace)

# Diff APPL-DB routes & ASIC-DB routes
rt_appl_miss, rt_asic_miss = diff_sorted_lists(rt_appl, rt_asic)

# Check missed ASIC routes against APPL-DB INTF_TABLE
_, rt_asic_miss = diff_sorted_lists(intf_appl, rt_asic_miss)
rt_asic_miss = filter_out_default_routes(rt_asic_miss)
rt_asic_miss = filter_out_vnet_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_standalone_tunnel_routes(namespace, rt_asic_miss)
rt_asic_miss = filter_out_soc_ip_routes(namespace, rt_asic_miss)


# Check APPL-DB INTF_TABLE with ASIC table route entries
intf_appl_miss, _ = diff_sorted_lists(intf_appl, rt_asic)

if rt_appl_miss:
rt_appl_miss = filter_out_local_interfaces(namespace, rt_appl_miss)

if rt_appl_miss:
rt_appl_miss = filter_out_voq_neigh_routes(namespace, rt_appl_miss)

# NOTE: On dualtor environment, ignore any route miss for the
# neighbors learned from the vlan subnet.
if rt_appl_miss or rt_asic_miss:
rt_appl_miss, rt_asic_miss = filter_out_vlan_neigh_route_miss(namespace, rt_appl_miss, rt_asic_miss)

if rt_appl_miss or rt_asic_miss:
# Look for subscribe updates for a second
adds[namespace], deletes[namespace] = get_subscribe_updates(selector, subs)

# Drop all those for which SET received
rt_appl_miss, _ = diff_sorted_lists(rt_appl_miss, adds[namespace])

# Drop all those for which DEL received
rt_asic_miss, _ = diff_sorted_lists(rt_asic_miss, deletes[namespace])

if rt_appl_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_ROUTE_TABLE_routes"] = rt_appl_miss

if intf_appl_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_INTF_TABLE_entries"] = intf_appl_miss

if rt_asic_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["Unaccounted_ROUTE_ENTRY_TABLE_entries"] = rt_asic_miss

if is_bgp_suppress_fib_pending_enabled(namespace):
rt_frr_miss = check_frr_pending_routes(namespace)

if rt_frr_miss:
if namespace not in results:
results[namespace] = {}
results[namespace]["missed_FRR_routes"] = rt_frr_miss

if results:
if rt_frr_miss and not rt_appl_miss and not rt_asic_miss:
print_message(syslog.LOG_ERR, "Some routes are not set offloaded in FRR{} but all "
"routes in APPL_DB and ASIC_DB are in sync".format(namespace))
if is_suppress_fib_pending_enabled(namespace):
mitigate_installed_not_offloaded_frr_routes(namespace, rt_frr_miss, rt_appl)

if results:
print_message(syslog.LOG_WARNING, "Failure results: {", json.dumps(results, indent=4), "}")
print_message(syslog.LOG_WARNING, "Failed. Look at reported mismatches above")
print_message(syslog.LOG_WARNING, "add: ", json.dumps(adds, indent=4))
print_message(syslog.LOG_WARNING, "del: ", json.dumps(deletes, indent=4))
return -1, results
else:
print_message(syslog.LOG_INFO, "All good!")
return 0, None
all_adds = {}
all_deletes = {}

# Use ThreadPoolExecutor to parallelize the check for each namespace
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {executor.submit(check_routes_for_namespace, ns): ns for ns in namespace_list}

for future in concurrent.futures.as_completed(futures):
ns = futures[future]
all_adds[ns] = []
all_deletes[ns] = []
try:
result, adds, deletes = future.result()
if result:
results[ns] = result
all_adds[ns] = adds
all_deletes[ns] = deletes
except Exception as e:
print_message(syslog.LOG_ERR, "Error processing namespace {}: {}".format(ns, e))
return -1, results

if results:
print_message(syslog.LOG_WARNING, "Failure results: {", json.dumps(results, indent=4), "}")
print_message(syslog.LOG_WARNING, "Failed. Look at reported mismatches above")
print_message(syslog.LOG_WARNING, "add: ", json.dumps(all_adds, indent=4))
print_message(syslog.LOG_WARNING, "del: ", json.dumps(all_deletes, indent=4))
return -1, results
else:
print_message(syslog.LOG_INFO, "All good!")
return 0, None


def main():
"""
Expand Down

0 comments on commit 3561996

Please sign in to comment.