From 06f8b1f98a120d87d9b9e4d265aa1ff14df0f95d Mon Sep 17 00:00:00 2001 From: Stepan Blyshchak <38952541+stepanblyschak@users.noreply.github.com> Date: Thu, 6 Oct 2022 18:06:46 +0300 Subject: [PATCH] [auto-ts] add memory check (#10433) (#12291) #### Why I did it To support automatic techsupport invocation in case memory usage is too high. #### How I did it Implemented according to https://github.com/Azure/SONiC/pull/939 #### How to verify it UT, manual test on the switch. *DEPENDS* on https://github.com/Azure/sonic-utilities/pull/2116 --- files/build_templates/init_cfg.json.j2 | 5 +- files/image_config/monit/conf.d/sonic-host | 3 + .../tests/auto_techsupport.json | 18 ++++- .../tests_config/auto_techsupport.json | 69 +++++++++++++++++-- .../yang-models/sonic-auto_techsupport.yang | 18 +++++ 5 files changed, 103 insertions(+), 10 deletions(-) diff --git a/files/build_templates/init_cfg.json.j2 b/files/build_templates/init_cfg.json.j2 index 31683753ed95..7de0ad977807 100644 --- a/files/build_templates/init_cfg.json.j2 +++ b/files/build_templates/init_cfg.json.j2 @@ -84,6 +84,8 @@ "rate_limit_interval" : "180", "max_techsupport_limit" : "10.0", "max_core_limit" : "5.0", + "available_mem_threshold": "10.0", + "min_available_mem": "200", "since" : "2 days ago" } }, @@ -93,7 +95,8 @@ {%- if enable_auto_tech_support == "y" %} "state" : "enabled", {% else %} "state" : "disabled", {% endif %} - "rate_limit_interval" : "600" + "rate_limit_interval" : "600", + "available_mem_threshold": "10.0" }{%if not loop.last %},{% endif -%} {% endfor %} }, diff --git a/files/image_config/monit/conf.d/sonic-host b/files/image_config/monit/conf.d/sonic-host index ceebf1003eb2..d65325207543 100644 --- a/files/image_config/monit/conf.d/sonic-host +++ b/files/image_config/monit/conf.d/sonic-host @@ -46,3 +46,6 @@ check program vnetRouteCheck with path "/usr/local/bin/vnet_route_check.py" every 5 cycles if status != 0 for 3 cycle then alert repeat every 1 cycles +# memory_check tool that 
verifies that memory usage does not cross the threshold or invokes techsupport. +check program memory_check with path "/usr/local/bin/memory_threshold_check.py" + if status == 2 for 10 times within 20 cycles then exec "/usr/local/bin/memory_threshold_check_handler.py" diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests/auto_techsupport.json b/src/sonic-yang-models/tests/yang_model_tests/tests/auto_techsupport.json index 8a65fec2b6d0..a335dcfd389c 100644 --- a/src/sonic-yang-models/tests/yang_model_tests/tests/auto_techsupport.json +++ b/src/sonic-yang-models/tests/yang_model_tests/tests/auto_techsupport.json @@ -8,7 +8,7 @@ }, "AUTO_TECHSUPPORT_INVALID_RATE_LIMIT_FORMAT": { "desc" : "Configure cooloff with a value of invalid format", - "eStrKey": "InvalidValue" + "eStrKey": "InvalidValue" }, "AUTO_TECHSUPPORT_OUT_OF_RANGE_DECIMAL": { "desc" : "Configure a value for core-uage outside the range [0, 100)", @@ -19,9 +19,23 @@ }, "AUTO_TECHSUPPORT_INVALID_FRACTION_DIGITS": { "desc" : "Configure a value for max_techsupport_size inside the range [0, 100) but with 3 fractional digits", - "eStrKey": "InvalidValue" + "eStrKey": "InvalidValue" }, "AUTO_TECHSUPPORT_RATE_LIMIT_INTERVAL_TEST": { "desc" : "Configure and test the valid configuration" + }, + "AUTO_TECHSUPPORT_AVAILABLE_MEM_THRESHOLD": { + "desc" : "Configure and test the valid configuration" + }, + "AUTO_TECHSUPPORT_INVALID_AVAILABLE_MEM_THRESHOLD": { + "desc" : "Configure a value for available_mem_threshold inside the range [0, 100) but with 3 fractional digits", + "eStrKey": "InvalidValue" + }, + "AUTO_TECHSUPPORT_GLOBAL_MEM_THRESHOLD_VALID": { + "desc" : "Configure and test the valid configuration" + }, + "AUTO_TECHSUPPORT_GLOBAL_MEM_THRESHOLD_INVALID_THRESHOLD": { + "desc" : "Configure a value for available_mem_threshold inside the range [0, 100) but with 3 fractional digits", + "eStrKey": "InvalidValue" } } diff --git a/src/sonic-yang-models/tests/yang_model_tests/tests_config/auto_techsupport.json 
b/src/sonic-yang-models/tests/yang_model_tests/tests_config/auto_techsupport.json index 43ac4ce82391..c8933bfe42fa 100644 --- a/src/sonic-yang-models/tests/yang_model_tests/tests_config/auto_techsupport.json +++ b/src/sonic-yang-models/tests/yang_model_tests/tests_config/auto_techsupport.json @@ -8,7 +8,7 @@ "max_techsupport_limit" : "10.0", "max_core_limit" : "5.0", "since" : "2 days ago" - } + } } } }, @@ -20,8 +20,8 @@ "rate_limit_interval" : "180", "max_techsupport_limit" : "10.0", "max_core_limit" : "5.0", - "since" : "2 days ago" - } + "since" : "2 days ago" + } } } }, @@ -30,7 +30,7 @@ "sonic-auto_techsupport:AUTO_TECHSUPPORT": { "sonic-auto_techsupport:GLOBAL": { "rate_limit_interval" : "whatever" - } + } } } }, @@ -40,7 +40,7 @@ "sonic-auto_techsupport:GLOBAL": { "max_core_limit" : "100.00", "rate_limit_interval" : "180" - } + } } } }, @@ -50,7 +50,7 @@ "sonic-auto_techsupport:GLOBAL": { "max_techsupport_limit" : "11.23", "max_core_limit" : "99.99" - } + } } } }, @@ -60,7 +60,7 @@ "sonic-auto_techsupport:GLOBAL": { "max_techsupport_limit" : "11.111", "max_core_limit" : "99.99" - } + } } } }, @@ -81,5 +81,60 @@ ] } } + }, + "AUTO_TECHSUPPORT_GLOBAL_MEM_THRESHOLD_VALID": { + "sonic-auto_techsupport:sonic-auto_techsupport": { + "sonic-auto_techsupport:AUTO_TECHSUPPORT": { + "sonic-auto_techsupport:GLOBAL": { + "available_mem_threshold": "10.0", + "min_available_mem": "900" + } + } + } + }, + "AUTO_TECHSUPPORT_GLOBAL_MEM_THRESHOLD_INVALID_THRESHOLD": { + "sonic-auto_techsupport:sonic-auto_techsupport": { + "sonic-auto_techsupport:AUTO_TECHSUPPORT": { + "sonic-auto_techsupport:GLOBAL": { + "available_mem_threshold": "11.111" + } + } + } + }, + "AUTO_TECHSUPPORT_AVAILABLE_MEM_THRESHOLD": { + "sonic-auto_techsupport:sonic-auto_techsupport": { + "sonic-auto_techsupport:AUTO_TECHSUPPORT_FEATURE": { + "AUTO_TECHSUPPORT_FEATURE_LIST": [ + { + "feature_name" : "bgp", + "state" : "enabled", + "available_mem_threshold": "10.0" + }, + { + "feature_name" : "swss", + 
"state" : "disabled", + "available_mem_threshold": "10.0" + } + ] + } + } + }, + "AUTO_TECHSUPPORT_INVALID_AVAILABLE_MEM_THRESHOLD": { + "sonic-auto_techsupport:sonic-auto_techsupport": { + "sonic-auto_techsupport:AUTO_TECHSUPPORT_FEATURE": { + "AUTO_TECHSUPPORT_FEATURE_LIST": [ + { + "feature_name" : "bgp", + "state" : "enabled", + "available_mem_threshold": "11.111" + }, + { + "feature_name" : "swss", + "state" : "disabled", + "available_mem_threshold": "10.0" + } + ] + } + } } } diff --git a/src/sonic-yang-models/yang-models/sonic-auto_techsupport.yang b/src/sonic-yang-models/yang-models/sonic-auto_techsupport.yang index 94934d7ab201..02e29463d594 100644 --- a/src/sonic-yang-models/yang-models/sonic-auto_techsupport.yang +++ b/src/sonic-yang-models/yang-models/sonic-auto_techsupport.yang @@ -59,6 +59,18 @@ module sonic-auto_techsupport { description "Max Limit in percentage for the cummulative size of core dumps. No cleanup is performed if the value isn't congiured or is 0.0"; type decimal-repr; } + + leaf available_mem_threshold { + description "Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing"; + type decimal-repr; + default 10.0; + } + + leaf min_available_mem { + description "Minimum Free memory (in MB) that should be available for the techsupport execution to start"; + type uint32; + default 200; + } leaf since { /* @@ -96,6 +108,12 @@ module sonic-auto_techsupport { type stypes:admin_mode; } + leaf available_mem_threshold { + description "Memory threshold; 0 to disable techsupport invocation on memory usage threshold crossing"; + type decimal-repr; + default 10.0; + } + leaf rate_limit_interval { description "Rate limit interval for the corresponding feature. Configure 0 to explicitly disable"; type uint16;