Recover primary nic if down after publishing hostname in RedhatOSUtil (…

…#3024) * Check nic state and recover if down: * Fix typo * Fix state comparison * Fix pylint errors * Fix string comparison * Report publish hostname failure in calling thread * Add todo to check nic state for all distros where we reset network * Update detection to check connection state and separate recover from publish * Pylint unused argument * refactor recover_nic argument * Network interface e2e test * e2e test for recovering the network interface on redhat distros * Only run scenario on distros which use RedhatOSUtil * Fix call to parent publish_hostname to include recover_nic arg * Update comments in default os util * Remove comment * Fix comment * Do not do detection/recover on RedhatOSMOdernUtil * Resolve PR comments * Make script executable * Revert pypy change * Fix publish hostname paramters
Azure · Feb 5, 2024 · 0cd8617 · 0cd8617
1 parent 25d7103
commit 0cd8617
Show file tree

Hide file tree

Showing 11 changed files with 308 additions and 14 deletions.
diff --git a/azurelinuxagent/common/event.py b/azurelinuxagent/common/event.py
@@ -95,6 +95,7 @@ class WALAEventOperation:
     HealthCheck = "HealthCheck"
     HealthObservation = "HealthObservation"
     HeartBeat = "HeartBeat"
+    HostnamePublishing = "HostnamePublishing"
     HostPlugin = "HostPlugin"
     HostPluginHeartbeat = "HostPluginHeartbeat"
     HostPluginHeartbeatExtended = "HostPluginHeartbeatExtended"

diff --git a/azurelinuxagent/common/osutil/default.py b/azurelinuxagent/common/osutil/default.py
@@ -1190,11 +1190,20 @@ def restart_if(self, ifname, retries=3, wait=5):
             else:
                 logger.warn("exceeded restart retries")
 
-    def publish_hostname(self, hostname):
+    def check_and_recover_nic_state(self, ifname):
+        # TODO: This should be implemented for all distros where we reset the network during publishing hostname. Currently it is only implemented in RedhatOSUtil.
+        pass
+
+    def publish_hostname(self, hostname, recover_nic=False):
+        """
+        Publishes the provided hostname.
+        """
         self.set_dhcp_hostname(hostname)
         self.set_hostname_record(hostname)
         ifname = self.get_if_name()
         self.restart_if(ifname)
+        if recover_nic:
+            self.check_and_recover_nic_state(ifname)
 
     def set_scsi_disks_timeout(self, timeout):
         for dev in os.listdir("/sys/block"):

diff --git a/azurelinuxagent/common/osutil/gaia.py b/azurelinuxagent/common/osutil/gaia.py
@@ -202,7 +202,7 @@ def set_hostname(self, hostname):
     def set_dhcp_hostname(self, hostname):
         logger.warn('set_dhcp_hostname is ignored on GAiA')
 
-    def publish_hostname(self, hostname):
+    def publish_hostname(self, hostname, recover_nic=False):
         logger.warn('publish_hostname is ignored on GAiA')
 
     def del_account(self, username):

diff --git a/azurelinuxagent/common/osutil/iosxe.py b/azurelinuxagent/common/osutil/iosxe.py
@@ -58,12 +58,12 @@ def set_hostname(self, hostname):
             logger.warn("[{0}] failed with error: {1}, attempting fallback".format(' '.join(hostnamectl_cmd), ustr(e)))
             DefaultOSUtil.set_hostname(self, hostname)
 
-    def publish_hostname(self, hostname):
+    def publish_hostname(self, hostname, recover_nic=False):
         """
         Restart NetworkManager first before publishing hostname
         """
         shellutil.run("service NetworkManager restart")
-        super(IosxeOSUtil, self).publish_hostname(hostname)
+        super(IosxeOSUtil, self).publish_hostname(hostname, recover_nic)
 
     def register_agent_service(self):
         return shellutil.run("systemctl enable waagent", chk_err=False)

diff --git a/azurelinuxagent/common/osutil/redhat.py b/azurelinuxagent/common/osutil/redhat.py
@@ -117,12 +117,11 @@ def set_hostname(self, hostname):
             logger.warn("[{0}] failed, attempting fallback".format(' '.join(hostnamectl_cmd)))
             DefaultOSUtil.set_hostname(self, hostname)
 
-    def get_nm_controlled(self):
-        ifname = self.get_if_name()
+    def get_nm_controlled(self, ifname):
         filepath = "/etc/sysconfig/network-scripts/ifcfg-{0}".format(ifname)
         nm_controlled_cmd = ['grep', 'NM_CONTROLLED=', filepath]
         try:
-            result = shellutil.run_command(nm_controlled_cmd, log_error=False, encode_output=False).rstrip()
+            result = shellutil.run_command(nm_controlled_cmd, log_error=False).rstrip()
 
             if result and len(result.split('=')) > 1:
                 # Remove trailing white space and ' or " characters
@@ -140,17 +139,87 @@ def get_nm_controlled(self):
 
         return True
 
-    def publish_hostname(self, hostname):
+    def get_nic_operational_and_general_states(self, ifname):
+        """
+        Checks the contents of /sys/class/net/{ifname}/operstate and the results of 'nmcli -g general.state device show {ifname}' to determine the state of the provided interface.
+        Raises an exception if the network interface state cannot be determined.
+        """
+        filepath = "/sys/class/net/{0}/operstate".format(ifname)
+        nic_general_state_cmd = ['nmcli', '-g', 'general.state', 'device', 'show', ifname]
+        if not os.path.isfile(filepath):
+            msg = "Unable to determine primary network interface {0} state, because state file does not exist: {1}".format(ifname, filepath)
+            logger.warn(msg)
+            raise Exception(msg)
+
+        try:
+            nic_oper_state = fileutil.read_file(filepath).rstrip().lower()
+            nic_general_state = shellutil.run_command(nic_general_state_cmd, log_error=True).rstrip().lower()
+            if nic_oper_state != "up":
+                logger.warn("The primary network interface {0} operational state is '{1}'.".format(ifname, nic_oper_state))
+            else:
+                logger.info("The primary network interface {0} operational state is '{1}'.".format(ifname, nic_oper_state))
+            if nic_general_state != "100 (connected)":
+                logger.warn("The primary network interface {0} general state is '{1}'.".format(ifname, nic_general_state))
+            else:
+                logger.info("The primary network interface {0} general state is '{1}'.".format(ifname, nic_general_state))
+            return nic_oper_state, nic_general_state
+        except Exception as e:
+            msg = "Unexpected error while determining the primary network interface state: {0}".format(e)
+            logger.warn(msg)
+            raise Exception(msg)
+
+    def check_and_recover_nic_state(self, ifname):
+        """
+        Checks if the provided network interface is in an 'up' state. If the network interface is in a 'down' state,
+        attempt to recover the interface by restarting the Network Manager service.
+
+        Raises an exception if an attempt to bring the interface into an 'up' state fails, or if the state
+         of the network interface cannot be determined.
+        """
+        nic_operstate, nic_general_state = self.get_nic_operational_and_general_states(ifname)
+        if nic_operstate == "down" or "disconnected" in nic_general_state:
+            logger.info("Restarting the Network Manager service to recover network interface {0}".format(ifname))
+            self.restart_network_manager()
+            # Interface does not come up immediately after NetworkManager restart. Wait 5 seconds before checking
+            # network interface state.
+            time.sleep(5)
+            nic_operstate, nic_general_state = self.get_nic_operational_and_general_states(ifname)
+            # It is possible for network interface to be in an unknown or unmanaged state. Log warning if state is not
+            # down, disconnected, up, or connected
+            if nic_operstate != "up" or nic_general_state != "100 (connected)":
+                msg = "Network Manager restart failed to bring network interface {0} into 'up' and 'connected' state".format(ifname)
+                logger.warn(msg)
+                raise Exception(msg)
+            else:
+                logger.info("Network Manager restart successfully brought the network interface {0} into 'up' and 'connected' state".format(ifname))
+        elif nic_operstate != "up" or nic_general_state != "100 (connected)":
+            # We already logged a warning with the network interface state in get_nic_operstate(). Raise an exception
+            # for the env thread to send to telemetry.
+            raise Exception("The primary network interface {0} operational state is '{1}' and general state is '{2}'.".format(ifname, nic_operstate, nic_general_state))
+
+    def restart_network_manager(self):
+        shellutil.run("service NetworkManager restart")
+
+    def publish_hostname(self, hostname, recover_nic=False):
         """
         Restart NetworkManager first before publishing hostname, only if the network interface is not controlled by the
         NetworkManager service (as determined by NM_CONTROLLED=n in the interface configuration). If the NetworkManager
         service is restarted before the agent publishes the hostname, and NM_controlled=y, a race condition may happen
         between the NetworkManager service and the Guest Agent making changes to the network interface configuration
         simultaneously.
+
+        Note: check_and_recover_nic_state(ifname) raises an Exception if an attempt to recover the network interface
+        fails, or if the network interface state cannot be determined. Callers should handle this exception by sending
+        an event to telemetry.
+
+        TODO: Improve failure reporting and add success reporting to telemetry for hostname changes. Right now we are only reporting failures to telemetry by raising an Exception in publish_hostname for the calling thread to handle by reporting the failure to telemetry.
         """
-        if not self.get_nm_controlled():
-            shellutil.run("service NetworkManager restart")
-        super(RedhatOSUtil, self).publish_hostname(hostname)
+        ifname = self.get_if_name()
+        nm_controlled = self.get_nm_controlled(ifname)
+        if not nm_controlled:
+            self.restart_network_manager()
+        # TODO: Current recover logic is only effective when the NetworkManager manages the network interface. Update the recover logic so it is effective even when NM_CONTROLLED=n
+        super(RedhatOSUtil, self).publish_hostname(hostname, recover_nic and nm_controlled)
 
     def register_agent_service(self):
         return shellutil.run("systemctl enable {0}".format(self.service_name), chk_err=False)
@@ -193,7 +262,11 @@ def restart_if(self, ifname, retries=3, wait=5):
             else:
                 logger.warn("exceeded restart retries")
 
-    def publish_hostname(self, hostname):
+    def check_and_recover_nic_state(self, ifname):
+        # TODO: Implement and test a way to recover the network interface for RedhatOSModernUtil
+        pass
+
+    def publish_hostname(self, hostname, recover_nic=False):
         # RedhatOSUtil was updated to conditionally run NetworkManager restart in response to a race condition between
         # NetworkManager restart and the agent restarting the network interface during publish_hostname. Keeping the
         # NetworkManager restart in RedhatOSModernUtil because the issue was not reproduced on these versions.

diff --git a/azurelinuxagent/common/osutil/suse.py b/azurelinuxagent/common/osutil/suse.py
@@ -72,7 +72,7 @@ def __init__(self):
         super(SUSEOSUtil, self).__init__()
         self.dhclient_name = 'wickedd-dhcp4'
 
-    def publish_hostname(self, hostname):
+    def publish_hostname(self, hostname, recover_nic=False):
         self.set_dhcp_hostname(hostname)
         self.set_hostname_record(hostname)
         ifname = self.get_if_name()

diff --git a/azurelinuxagent/ga/env.py b/azurelinuxagent/ga/env.py
@@ -171,7 +171,11 @@ def _operation(self):
                         self._hostname,
                         curr_hostname)
             self._osutil.set_hostname(curr_hostname)
-            self._osutil.publish_hostname(curr_hostname)
+            try:
+                self._osutil.publish_hostname(curr_hostname, recover_nic=True)
+            except Exception as e:
+                msg = "Error while publishing the hostname: {0}".format(e)
+                add_event(AGENT_NAME, op=WALAEventOperation.HostnamePublishing, is_success=False, message=msg, log_event=False)
             self._hostname = curr_hostname
 
 

diff --git a/tests_e2e/test_suites/images.yml b/tests_e2e/test_suites/images.yml
@@ -85,6 +85,7 @@ images:
       locations:
          AzureChinaCloud: []
    centos_610: "OpenLogic CentOS 6.10 latest"
+   centos_75: "OpenLogic CentOS 7.5 latest"
    centos_79: "OpenLogic CentOS 7_9 latest"
    centos_82:
       urn: "OpenLogic CentOS 8_2 latest"
@@ -121,7 +122,14 @@ images:
          AzureChinaCloud: []
          AzureUSGovernment: []
    oracle_610: "Oracle Oracle-Linux 6.10 latest"
+   oracle_75: "Oracle Oracle-Linux 7.5 latest"
+   oracle_79: "Oracle Oracle-Linux ol79-gen2 latest"
+   oracle_82: "Oracle Oracle-Linux ol82-gen2 latest"
    rhel_610: "RedHat RHEL 6.10 latest"
+   rhel_75:
+      urn: "RedHat RHEL 7.5 latest"
+      locations:
+         AzureChinaCloud: []
    rhel_79:
       urn: "RedHat RHEL 7_9 latest"
       locations:

diff --git a/tests_e2e/test_suites/recover_network_interface.yml b/tests_e2e/test_suites/recover_network_interface.yml
@@ -0,0 +1,17 @@
+#
+# Brings the primary network interface down and checks that the agent can recover the network.
+#
+name: "RecoverNetworkInterface"
+tests:
+  - "recover_network_interface/recover_network_interface.py"
+images:
+# TODO: This scenario should be run on all distros which bring the network interface down to publish hostname. Currently, only RedhatOSUtil attempts to recover the network interface if down after hostname publishing.
+  - "centos_79"
+  - "centos_75"
+  - "centos_82"
+  - "rhel_75"
+  - "rhel_79"
+  - "rhel_82"
+  - "oracle_75"
+  - "oracle_79"
+  - "oracle_82"
diff --git a/tests_e2e/tests/recover_network_interface/recover_network_interface.py b/tests_e2e/tests/recover_network_interface/recover_network_interface.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+
+# Microsoft Azure Linux Agent
+#
+# Copyright 2018 Microsoft Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# This test uses CSE to bring the network down and call check_and_recover_nic_state to bring the network back into an
+# 'up' and 'connected' state. The intention of the test is to alert us if there is some change in newer distros which
+# affects this logic.
+#
+
+import json
+from typing import List, Dict, Any
+
+from assertpy import fail, assert_that
+from time import sleep
+
+from tests_e2e.tests.lib.agent_test import AgentVmTest, TestSkipped
+from tests_e2e.tests.lib.agent_test_context import AgentVmTestContext
+from tests_e2e.tests.lib.logging import log
+from tests_e2e.tests.lib.virtual_machine_extension_client import VirtualMachineExtensionClient
+from tests_e2e.tests.lib.vm_extension_identifier import VmExtensionIds
+
+
+class RecoverNetworkInterface(AgentVmTest):
+    def __init__(self, context: AgentVmTestContext):
+        super().__init__(context)
+        self._context = context
+        self._ssh_client = context.create_ssh_client()
+        self._private_ip = context.vm.get_private_ip_address()
+        self._vm_password = ""
+
+    def add_vm_password(self):
+        # Add password to VM to help with debugging in case of failure
+        # REMOVE PWD FROM LOGS IF WE EVER MAKE THESE RUNS/LOGS PUBLIC
+        username = self._ssh_client.username
+        pwd = self._ssh_client.run_command("openssl rand -base64 32 | tr : .").rstrip()
+        self._vm_password = pwd
+        log.info("VM Username: {0}; VM Password: {1}".format(username, pwd))
+        self._ssh_client.run_command("echo '{0}:{1}' | sudo -S chpasswd".format(username, pwd))
+
+    def check_agent_reports_status(self):
+        status_updated = False
+        last_agent_status_time = self._context.vm.get_instance_view().vm_agent.statuses[0].time
+        log.info("Agent reported status at {0}".format(last_agent_status_time))
+        retries = 3
+
+        while retries > 0 and not status_updated:
+            agent_status_time = self._context.vm.get_instance_view().vm_agent.statuses[0].time
+            if agent_status_time != last_agent_status_time:
+                status_updated = True
+                log.info("Agent reported status at {0}".format(last_agent_status_time))
+            else:
+                retries -= 1
+                sleep(60)
+
+        if not status_updated:
+            fail("Agent hasn't reported status since {0} and ssh connection failed. Use the serial console in portal "
+                 "to debug".format(last_agent_status_time))
+
+    def run(self):
+        # Add password to VM and log. This allows us to debug with serial console if necessary
+        log.info("")
+        log.info("Adding password to the VM to use for debugging in case necessary...")
+        self.add_vm_password()
+
+        # Skip the test if NM_CONTROLLED=n. The current recover logic does not work in this case
+        result = self._ssh_client.run_command("recover_network_interface-get_nm_controlled.py", use_sudo=True)
+        if "Interface is NOT NM controlled" in result:
+            raise TestSkipped("Current recover method will not work on interfaces where NM_Controlled=n")
+
+        # Get the primary network interface name
+        ifname = self._ssh_client.run_command("pypy3 -c 'from azurelinuxagent.common.osutil.redhat import RedhatOSUtil; print(RedhatOSUtil().get_if_name())'").rstrip()
+        # The interface name needs to be in double quotes for the pypy portion of the script
+        formatted_ifname = f'"{ifname}"'
+
+        # The script should bring the primary network interface down and use the agent to recover the interface. These
+        # commands will bring the network down, so they should be executed on the machine using CSE instead of ssh.
+        script = f"""
+        set -euxo pipefail
+        ifdown {ifname};
+        nic_state=$(nmcli -g general.state device show {ifname})
+        echo Primary network interface state before recovering: $nic_state
+        source /home/{self._context.username}/bin/set-agent-env;
+        pypy3 -c 'from azurelinuxagent.common.osutil.redhat import RedhatOSUtil; RedhatOSUtil().check_and_recover_nic_state({formatted_ifname})';
+        nic_state=$(nmcli -g general.state device show {ifname});
+        echo Primary network interface state after recovering: $nic_state
+        """
+        log.info("")
+        log.info("Using CSE to bring the primary network interface down and call the OSUtil to bring the interface back up. Command to execute: {0}".format(script))
+        custom_script = VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, resource_name="CustomScript")
+        custom_script.enable(
+            protected_settings={
+                'commandToExecute': script
+            }
+        )
+
+        # Check that the interface was down and brought back up in instance view
+        log.info("")
+        log.info("Checking the instance view to confirm the primary network interface was brought down and successfully recovered by the agent...")
+        instance_view = custom_script.get_instance_view()
+        log.info("Instance view for custom script after enable is: {0}".format(json.dumps(instance_view.serialize(), indent=4)))
+        assert_that(len(instance_view.statuses)).described_as("Instance view should have a status for CustomScript").is_greater_than(0)
+        assert_that(instance_view.statuses[0].message).described_as("The primary network interface should be in a disconnected state before the attempt to recover").contains("Primary network interface state before recovering: 30 (disconnected)")
+        assert_that(instance_view.statuses[0].message).described_as("The primary network interface should be in a connected state after the attempt to recover").contains("Primary network interface state after recovering: 100 (connected)")
+
+        # Check that the agent is successfully reporting status after recovering the network
+        log.info("")
+        log.info("Checking that the agent is reporting status after recovering the network...")
+        self.check_agent_reports_status()
+
+        log.info("")
+        log.info("The primary network interface was successfully recovered by the agent.")
+
+    def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
+        ignore_rules = [
+            #
+            # We may see temporary network unreachable warnings since we are bringing the network interface down
+            # 2024-02-01T23:40:03.563499Z ERROR ExtHandler ExtHandler Error fetching the goal state: [ProtocolError] GET vmSettings [correlation ID: ac21bdd7-1a7a-4bba-b307-b9d5bc30da33 eTag: 941323814975149980]: Request failed: [Errno 101] Network is unreachable
+            #
+            {
+                'message': r"Error fetching the goal state: \[ProtocolError\] GET vmSettings.*Request failed: \[Errno 101\] Network is unreachable"
+            }
+        ]
+        return ignore_rules
+
+
+if __name__ == "__main__":
+    RecoverNetworkInterface.run_from_command_line()