Skip to content

Commit

Permalink
Shutdown all eBGP neighbors when running SFP platform tests (#4950)
Browse files Browse the repository at this point in the history
* Shutdown all eBGP neighbors when running SFP platform tests

The SFP tests sequentially do negative actions on all the SFPs on all the DUTs. The only check is that the
command/API used worked - response is OK. Most of these tests will cause a quick link bounce (a few seconds)

In a scaled topology (like a T2 VOQ chassis) with 8K-12K routes per eBGP session, such a quick link bounce
of all the links within a span of a minute causes a lot of stress on the CPU of the DUTs. Running 'top' on one of
the DUTs during a test run on a linecard in a VoQ chassis shows that processes like 'zebra', 'bgpd',
'redis-server', 'orchagent' etc. are pegging at 50%-160% because of all the routes that have to be
deleted and propagated to all the remote iBGP peers. This causes the VoQ related tables to be missing
some info. Specifically, we see that some eBGP neighbors are missing in the 'ip neighbor table' as well
as the ASIC_DB on the remote asics.

To avoid this stress, we shutdown eBGP on all the neighbors on all the DUTs before running the SFP related tests.
With eBGP disabled and thus all routes gone from eBGP, top shows that the processes are running at 2%-6% range as expected.

To accomplish this added a module scope fixture 'shutdown_ebgp' in duthost_utils that would:
- In setup:
  - get the number of total eBGP routes - using 'show ip route summary'
  - shutdown all eBGP neighbors
  - verify that the eBGP routes are 0
- In cleanup:
  - startup all eBGP neighbors
  - verify that the eBGP routes are the same as they were before we shutdown all eBGP neighbors

We then call this fixture in the SFP tests as needed.

* As part of shutdown_ebgp, check the orchagent CPU utilization (#4950 - sanmalho-git:shut_bgp)

Added check to make sure that the cpu utilization of orchagent after 'shutdown' or
'startup' of all eBGP neighbors is less than 10% for 5 consecutive seconds

* review comments

* Use a jitter of 5 in checking number of ebgp routes

Co-authored-by: Manouchehr Taheri <manouchehr.taheri@nokia.com>
  • Loading branch information
sanmalho-git and mannytaheri authored Jan 30, 2022
1 parent 7530465 commit b36f7db
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 4 deletions.
70 changes: 69 additions & 1 deletion tests/common/fixtures/duthost_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
import itertools
import collections
import ipaddress

import time
from tests.common.helpers.assertions import pytest_assert
from tests.common.utilities import wait_until
from jinja2 import Template


logger = logging.getLogger(__name__)


Expand Down Expand Up @@ -164,6 +167,71 @@ def ports_list(duthosts, rand_one_dut_hostname, rand_selected_dut, tbinfo):
return ports


def check_orch_cpu_utilization(dut, orch_cpu_threshold):
    """
    Sample orchagent CPU utilization 5 times, with a 1 second interval in between, and
    report whether every reading stayed at or below the threshold.

    Args:
        dut: DUT host object; dut.shell(cmd) must return a mapping with "stdout_lines"
        orch_cpu_threshold: highest acceptable orchagent CPU utilization

    Returns:
        False as soon as any reading exceeds orch_cpu_threshold, True if none of the
        5 samples does.
    """
    num_readings = 5
    for attempt in range(num_readings):
        orch_cpu = dut.shell("COLUMNS=512 show processes cpu | grep orchagent | awk '{print $9}'")["stdout_lines"]
        for line in orch_cpu:
            # A blank line carries no reading; float('') would raise ValueError.
            if not line.strip():
                continue
            # Compare the untruncated value: int(float("10.9")) == 10 would wrongly
            # pass a threshold of 10.
            if float(line) > orch_cpu_threshold:
                return False
        # Sleep only between readings, not after the last one.
        if attempt < num_readings - 1:
            time.sleep(1)
    return True


def check_ebgp_routes(num_v4_routes, num_v6_routes, duthost):
    """
    Check that the DUT's current eBGP route counts are close to the expected counts.

    Args:
        num_v4_routes: expected number of IPv4 eBGP routes
        num_v6_routes: expected number of IPv6 eBGP routes
        duthost: DUT host object; get_ip_route_summary() must return a (v4, v6) pair of dicts

    Returns:
        True if both the v4 and v6 counts differ from the expected values by less than
        MAX_DIFF, False otherwise.
    """
    MAX_DIFF = 5
    sumv4, sumv6 = duthost.get_ip_route_summary()
    for summary, expected in ((sumv4, num_v4_routes), (sumv6, num_v6_routes)):
        # A summary with no 'ebgp' entry (or no 'routes' in it) is counted as 0 routes,
        # the same default shutdown_ebgp uses when recording the baseline. Skipping the
        # comparison instead would report success while no eBGP routes are present,
        # even when thousands are expected.
        current = summary.get('ebgp', {}).get('routes', 0)
        if abs(int(float(current)) - int(float(expected))) >= MAX_DIFF:
            return False
    return True

def _assert_orch_cpu_settled(duthost, orch_cpu_threshold, action):
    """Fail if check_orch_cpu_utilization never succeeds on duthost while wait_until polls it.

    Args:
        duthost: DUT host object
        orch_cpu_threshold: threshold handed to check_orch_cpu_utilization
        action: "shutdown" or "startup"; only used in the failure message
    """
    if wait_until(60, 2, 0, check_orch_cpu_utilization, duthost, orch_cpu_threshold):
        return
    # Sample the CPU for the message only on failure, so the passing path does not run an
    # extra shell command on the DUT. COLUMNS=512 matches the command used by the check.
    current = duthost.shell("COLUMNS=512 show processes cpu | grep orchagent | awk '{print $9}'")["stdout"]
    pytest_assert(False,
                  "Orch CPU utilization {} > orch cpu threshold {} after {} all eBGP"
                  .format(current, orch_cpu_threshold, action))


@pytest.fixture(scope="module")
def shutdown_ebgp(duthosts):
    """
    Shut down all eBGP neighbors on every frontend DUT for the duration of a test module.

    Setup records each DUT's eBGP v4/v6 route counts, runs 'config bgp shutdown all',
    then verifies the eBGP routes are gone and orchagent CPU has settled.
    Teardown runs 'config bgp startup all' on every frontend DUT, then verifies the route
    counts are back near the recorded values and orchagent CPU has settled.
    """
    # To store the original number of eBGP v4 and v6 routes.
    v4ebgps = {}
    v6ebgps = {}
    orch_cpu_threshold = 10
    # DUTs on which the shutdown command has been issued; used to undo a failed setup.
    shut_duthosts = []
    try:
        for duthost in duthosts.frontend_nodes:
            # Get the original number of eBGP v4 and v6 routes on the DUT.
            sumv4, sumv6 = duthost.get_ip_route_summary()
            v4ebgps[duthost.hostname] = sumv4.get('ebgp', {'routes': 0})['routes']
            v6ebgps[duthost.hostname] = sumv6.get('ebgp', {'routes': 0})['routes']
            # Shutdown all eBGP neighbors. Record the DUT first so a partially applied
            # shutdown is still reverted.
            shut_duthosts.append(duthost)
            duthost.command("sudo config bgp shutdown all")
            # Verify that the total eBGP routes are 0.
            pytest_assert(wait_until(30, 2, 0, check_ebgp_routes, 0, 0, duthost),
                          "eBGP routes are not 0 after shutting down all neighbors on {}".format(duthost))
            _assert_orch_cpu_settled(duthost, orch_cpu_threshold, "shutdown")
    except BaseException:
        # The code after `yield` does not run when setup fails, so bring eBGP back up here;
        # otherwise the DUTs would be left with all neighbors shut down.
        # BaseException is caught because pytest's failure exception is not an Exception subclass.
        for duthost in shut_duthosts:
            duthost.command("sudo config bgp startup all")
        raise

    yield

    for duthost in duthosts.frontend_nodes:
        # Startup all the eBGP neighbors
        duthost.command("sudo config bgp startup all")

    for duthost in duthosts.frontend_nodes:
        # Verify that total eBGP routes are what they were before shutdown of all eBGP neighbors
        orig_v4_ebgp = v4ebgps[duthost.hostname]
        orig_v6_ebgp = v6ebgps[duthost.hostname]
        pytest_assert(wait_until(120, 10, 10, check_ebgp_routes, orig_v4_ebgp, orig_v6_ebgp, duthost),
                      "eBGP v4 routes are {}, and v6 route are {}, and not what they were originally after enabling all neighbors on {}".format(orig_v4_ebgp, orig_v6_ebgp, duthost))
        _assert_orch_cpu_settled(duthost, orch_cpu_threshold, "startup")
@pytest.fixture(scope="module")
def utils_vlan_ports_list(duthosts, rand_one_dut_hostname, rand_selected_dut, tbinfo, ports_list):
"""
Expand Down
3 changes: 2 additions & 1 deletion tests/platform_tests/api/test_sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from tests.common.platform.interface_utils import get_physical_port_indices
from tests.common.utilities import wait_until
from tests.common.fixtures.conn_graph_facts import conn_graph_facts
from tests.common.fixtures.duthost_utils import shutdown_ebgp

from platform_api_test_base import PlatformApiTestBase

Expand All @@ -30,7 +31,7 @@
]

@pytest.fixture(scope="class")
def setup(request, duthosts, enum_rand_one_per_hwsku_hostname, xcvr_skip_list, conn_graph_facts):
def setup(request, duthosts, enum_rand_one_per_hwsku_hostname, xcvr_skip_list, conn_graph_facts, shutdown_ebgp):
sfp_setup = {}
duthost = duthosts[enum_rand_one_per_hwsku_hostname]

Expand Down
5 changes: 3 additions & 2 deletions tests/platform_tests/sfp/test_sfputil.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from util import parse_output
from util import get_dev_conn
from tests.common.utilities import skip_release
from tests.common.fixtures.duthost_utils import shutdown_ebgp

cmd_sfp_presence = "sudo sfputil show presence"
cmd_sfp_eeprom = "sudo sfputil show eeprom"
Expand Down Expand Up @@ -86,7 +87,7 @@ def test_check_sfputil_eeprom(duthosts, enum_rand_one_per_hwsku_frontend_hostnam
assert parsed_eeprom[intf] == "SFP EEPROM detected"


def test_check_sfputil_reset(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_frontend_asic_index, conn_graph_facts, tbinfo, xcvr_skip_list):
def test_check_sfputil_reset(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_frontend_asic_index, conn_graph_facts, tbinfo, xcvr_skip_list, shutdown_ebgp):
"""
@summary: Check SFP presence using 'sfputil show presence'
"""
Expand Down Expand Up @@ -125,7 +126,7 @@ def test_check_sfputil_reset(duthosts, enum_rand_one_per_hwsku_frontend_hostname
"Some interfaces are down: {}".format(intf_facts["ansible_interface_link_down_ports"])


def test_check_sfputil_low_power_mode(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_frontend_asic_index, conn_graph_facts, tbinfo, xcvr_skip_list):
def test_check_sfputil_low_power_mode(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_frontend_asic_index, conn_graph_facts, tbinfo, xcvr_skip_list, shutdown_ebgp):
"""
@summary: Check SFP low power mode
Expand Down

0 comments on commit b36f7db

Please sign in to comment.