Skip to content

Commit

Permalink
Merge branch 'develop' into agent_status_scenario
Browse files Browse the repository at this point in the history
  • Loading branch information
maddieford committed Aug 21, 2023
2 parents 2e1f887 + fd2a181 commit fc394af
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 69 deletions.
41 changes: 31 additions & 10 deletions azurelinuxagent/ga/exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

from azurelinuxagent.common import conf
from azurelinuxagent.common import logger
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.utils import fileutil
from azurelinuxagent.common import version
from azurelinuxagent.common.agent_supported_feature import get_agent_supported_features_list_for_extensions, \
Expand Down Expand Up @@ -299,7 +300,7 @@ def run(self):
# we make a deep copy of the extensions, since changes are made to self.ext_handlers while processing the extensions
self.ext_handlers = copy.deepcopy(egs.extensions)

if not self._extension_processing_allowed():
if self._extensions_on_hold():
return

utc_start = datetime.datetime.utcnow()
Expand Down Expand Up @@ -433,17 +434,15 @@ def _cleanup_outdated_handlers(self):
except OSError as e:
logger.warn("Failed to remove extension package {0}: {1}".format(pkg, e.strerror))

def _extension_processing_allowed(self):
if not conf.get_extensions_enabled():
logger.verbose("Extension handling is disabled")
return False

def _extensions_on_hold(self):
    """Return True when the current goal state asks the agent to hold extension processing.

    The on-hold flag is only honored when over-provisioning support is enabled in the
    agent configuration; otherwise extension processing proceeds normally.

    Returns:
        bool: True if processing should be skipped for this goal state, False otherwise.
    """
    # The on-hold signal is only meaningful when over-provisioning is enabled.
    if conf.get_enable_overprovisioning():
        if self.protocol.get_goal_state().extensions_goal_state.on_hold:
            msg = "Extension handling is on hold"
            logger.info(msg)
            # Emit telemetry so the hold is visible outside the local log.
            add_event(op=WALAEventOperation.ExtensionProcessing, message=msg)
            return True

    return False

@staticmethod
def __get_dependency_level(tup):
Expand Down Expand Up @@ -478,10 +477,30 @@ def handle_ext_handlers(self, goal_state_id):
max_dep_level = self.__get_dependency_level(all_extensions[-1]) if any(all_extensions) else 0

depends_on_err_msg = None
extensions_enabled = conf.get_extensions_enabled()
for extension, ext_handler in all_extensions:

handler_i = ExtHandlerInstance(ext_handler, self.protocol, extension=extension)

# In case of extensions disabled, we skip processing extensions. But CRP is still waiting for some status
# back for the skipped extensions. In order to propagate the status back to CRP, we will report status back
# here with an error message.
if not extensions_enabled:
agent_conf_file_path = get_osutil().agent_conf_file_path
msg = "Extension will not be processed since extension processing is disabled. To enable extension " \
"processing, set Extensions.Enabled=y in '{0}'".format(agent_conf_file_path)
ext_full_name = handler_i.get_extension_full_name(extension)
logger.info('')
logger.info("{0}: {1}".format(ext_full_name, msg))
add_event(op=WALAEventOperation.ExtensionProcessing, message="{0}: {1}".format(ext_full_name, msg))
handler_i.set_handler_status(status=ExtHandlerStatusValue.not_ready, message=msg, code=-1)
handler_i.create_status_file_if_not_exist(extension,
status=ExtensionStatusValue.error,
code=-1,
operation=handler_i.operation,
message=msg)
continue

# In case of depends-on errors, we skip processing extensions if there was an error processing dependent extensions.
# But CRP is still waiting for some status back for the skipped extensions. In order to propagate the status back to CRP,
# we will report status back here with the relevant error message for each of the dependent extension.
Expand Down Expand Up @@ -991,7 +1010,9 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed):
# For MultiConfig, we need to report status per extension even for Handler level failures.
# If we have HandlerStatus for a MultiConfig handler and GS is requesting for it, we would report status per
# extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc)
if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config:
# We also need to report extension status for an uninstalled handler if extensions are disabled because CRP
# waits for extension runtime status before failing the extension operation.
if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled():

# Since we require reading the Manifest for reading the heartbeat, this would fail if HandlerManifest not found.
# Only try to read heartbeat if HandlerState != NotInstalled.
Expand Down
14 changes: 13 additions & 1 deletion tests/ga/test_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -1649,7 +1649,19 @@ def test_extensions_disabled(self, _, *args):
exthandlers_handler.run()
exthandlers_handler.report_ext_handlers_status()

self._assert_no_handler_status(protocol.report_vm_status)
report_vm_status = protocol.report_vm_status
self.assertTrue(report_vm_status.called)
args, kw = report_vm_status.call_args # pylint: disable=unused-variable
vm_status = args[0]
self.assertEqual(1, len(vm_status.vmAgent.extensionHandlers))
exthandler = vm_status.vmAgent.extensionHandlers[0]
self.assertEqual(-1, exthandler.code)
self.assertEqual('NotReady', exthandler.status)
self.assertEqual("Extension will not be processed since extension processing is disabled. To enable extension processing, set Extensions.Enabled=y in '/etc/waagent.conf'", exthandler.message)
ext_status = exthandler.extension_status
self.assertEqual(-1, ext_status.code)
self.assertEqual('error', ext_status.status)
self.assertEqual("Extension will not be processed since extension processing is disabled. To enable extension processing, set Extensions.Enabled=y in '/etc/waagent.conf'", ext_status.message)

def test_extensions_deleted(self, *args):
# Ensure initial enable is successful
Expand Down
89 changes: 56 additions & 33 deletions tests_e2e/orchestrator/lib/agent_test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.logging import set_current_thread_log
from tests_e2e.tests.lib.agent_log import AgentLogRecord
from tests_e2e.tests.lib.shell import run_command
from tests_e2e.tests.lib.shell import run_command, CommandError
from tests_e2e.tests.lib.ssh_client import SshClient


Expand Down Expand Up @@ -304,29 +304,6 @@ def _setup_node(self, install_test_agent: bool) -> None:
log.info("Downloading %s to %s", pypy_download, pypy_path)
run_command(["wget", pypy_download, "-O", pypy_path])

#
# Create a tarball with the files we need to copy to the test node. The tarball includes two directories:
#
# * bin - Executables file (Bash and Python scripts)
# * lib - Library files (Python modules)
#
# After extracting the tarball on the test node, 'bin' will be added to PATH and PYTHONPATH will be set to 'lib'.
#
# Note that executables are placed directly under 'bin', while the path for Python modules is preserved under 'lib.
#
tarball_path: Path = Path("/tmp/waagent.tar")
log.info("Creating %s with the files need on the test node", tarball_path)
log.info("Adding orchestrator/scripts")
command = "cd {0} ; tar cvf {1} --transform='s,^,bin/,' *".format(self.context.test_source_directory/"orchestrator"/"scripts", str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Adding tests/scripts")
command = "cd {0} ; tar rvf {1} --transform='s,^,bin/,' *".format(self.context.test_source_directory/"tests"/"scripts", str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Adding tests/lib")
command = "cd {0} ; tar rvf {1} --transform='s,^,lib/,' --exclude=__pycache__ tests_e2e/tests/lib".format(self.context.test_source_directory.parent, str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Contents of %s:\n\n%s", tarball_path, run_command(['tar', 'tvf', str(tarball_path)]))

#
# Cleanup the test node (useful for developer runs)
#
Expand All @@ -335,24 +312,70 @@ def _setup_node(self, install_test_agent: bool) -> None:
self.context.ssh_client.run_command("rm -rvf ~/{bin,lib,tmp}", use_sudo=True)

#
# Copy the tarball, Pypy and the test Agent to the test node
# Copy Pypy and the test Agent to the test node
#
target_path = Path("~")/"tmp"
self.context.ssh_client.run_command(f"mkdir {target_path}")
log.info("Copying %s to %s:%s", tarball_path, self.context.node.name, target_path)
self.context.ssh_client.copy_to_node(tarball_path, target_path)
log.info("Copying %s to %s:%s", pypy_path, self.context.node.name, target_path)
self.context.ssh_client.copy_to_node(pypy_path, target_path)
agent_package_path: Path = self._get_agent_package_path()
log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, target_path)
self.context.ssh_client.copy_to_node(agent_package_path, target_path)

#
# Extract the tarball and execute the install scripts
#
log.info('Installing tools on the test node')
command = f"tar xvf {target_path/tarball_path.name} && ~/bin/install-tools"
log.info("Remote command [%s] completed:\n%s", command, self.context.ssh_client.run_command(command))
# tar commands sometimes fail with 'tar: Unexpected EOF in archive' error. Retry tarball creation, copy, and
# extraction if we hit this error
tar_retries = 3
while tar_retries > 0:
try:
#
# Create a tarball with the files we need to copy to the test node. The tarball includes two directories:
#
# * bin - Executables file (Bash and Python scripts)
# * lib - Library files (Python modules)
#
# After extracting the tarball on the test node, 'bin' will be added to PATH and PYTHONPATH will be set to 'lib'.
#
# Note that executables are placed directly under 'bin', while the path for Python modules is preserved under 'lib.
#
tarball_path: Path = Path("/tmp/waagent.tar")
log.info("Creating %s with the files need on the test node", tarball_path)
log.info("Adding orchestrator/scripts")
command = "cd {0} ; tar cvf {1} --transform='s,^,bin/,' *".format(self.context.test_source_directory/"orchestrator"/"scripts", str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Adding tests/scripts")
command = "cd {0} ; tar rvf {1} --transform='s,^,bin/,' *".format(self.context.test_source_directory/"tests"/"scripts", str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Adding tests/lib")
command = "cd {0} ; tar rvf {1} --transform='s,^,lib/,' --exclude=__pycache__ tests_e2e/tests/lib".format(self.context.test_source_directory.parent, str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Contents of %s:\n\n%s", tarball_path, run_command(['tar', 'tvf', str(tarball_path)]))

#
# Copy the tarball to the test node
#
log.info("Copying %s to %s:%s", tarball_path, self.context.node.name, target_path)
self.context.ssh_client.copy_to_node(tarball_path, target_path)

#
# Extract the tarball and execute the install scripts
#
log.info('Installing tools on the test node')
command = f"tar xvf {target_path/tarball_path.name} && ~/bin/install-tools"
log.info("Remote command [%s] completed:\n%s", command, self.context.ssh_client.run_command(command))

# Tarball creation and extraction was successful - no need to retry
tar_retries = 0

except CommandError as error:
if "tar: Unexpected EOF in archive" in error.stderr:
tar_retries -= 1
# Log the error with traceback to see which tar operation failed
log.info(f"Tarball creation or extraction failed: \n{error}")
# Retry tar operations
if tar_retries > 0:
log.info("Retrying tarball creation and extraction...")
else:
raise Exception(f"Unexpected error when creating or extracting tarball during node setup: {error}")

if self.context.is_vhd:
log.info("Using a VHD; will not install the Test Agent.")
Expand Down
102 changes: 77 additions & 25 deletions tests_e2e/tests/extensions_disabled/extensions_disabled.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@

import datetime
import pytz
import uuid

from assertpy import assert_that, fail
from typing import Any, Dict, List
from typing import Any

from azure.mgmt.compute.models import VirtualMachineInstanceView

Expand All @@ -39,39 +40,77 @@


class ExtensionsDisabled(AgentTest):
class TestCase:
    """Bundles an extension client together with the settings used to enable it.

    Instances are iterated by the test's run loop so each (extension, settings)
    pair can be exercised with identical logic.
    """

    def __init__(self, extension: VirtualMachineExtensionClient, settings: Any):
        # Store the enable-time settings alongside the client they belong to.
        self.settings = settings
        self.extension = extension

def run(self):
ssh_client: SshClient = self._context.create_ssh_client()

# Disable extension processing on the test VM
log.info("")
log.info("Disabling extension processing on the test VM [%s]", self._context.vm.name)
output = ssh_client.run_command("update-waagent-conf Extensions.Enabled=n", use_sudo=True)
log.info("Disable completed:\n%s", output)

# From now on, extensions will time out; set the timeout to the minimum allowed(15 minutes)
log.info("Setting the extension timeout to 15 minutes")
vm: VirtualMachineClient = VirtualMachineClient(self._context.vm)

vm.update({"extensionsTimeBudget": "PT15M"})

disabled_timestamp: datetime.datetime = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)

#
# Validate that the agent is not processing extensions by attempting to run CustomScript
#
log.info("Executing CustomScript; it should time out after 15 min or so.")
custom_script = VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, resource_name="CustomScript")
try:
custom_script.enable(settings={'commandToExecute': "date"}, force_update=True, timeout=20 * 60)
fail("CustomScript should have timed out")
except Exception as error:
assert_that("VMExtensionProvisioningTimeout" in str(error)) \
.described_as(f"Expected a VMExtensionProvisioningTimeout: {error}") \
.is_true()
log.info("CustomScript timed out as expected")
# Prepare test cases
unique = str(uuid.uuid4())
test_file = f"waagent-test.{unique}"
test_cases = [
ExtensionsDisabled.TestCase(
VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript,
resource_name="CustomScript"),
{'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"}
),
ExtensionsDisabled.TestCase(
VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler,
resource_name="RunCommandHandler"),
{'source': {'script': f"echo '{unique}' > /tmp/{test_file}"}}
)
]

for t in test_cases:
log.info("")
log.info("Test case: %s", t.extension)
#
# Validate that the agent is not processing extensions by attempting to enable extension & checking that
# provisioning fails fast
#
log.info(
"Executing {0}; the agent should report a VMExtensionProvisioningError without processing the extension"
.format(t.extension.__str__()))

try:
t.extension.enable(settings=t.settings, force_update=True, timeout=6 * 60)
fail("The agent should have reported an error processing the goal state")
except Exception as error:
assert_that("VMExtensionProvisioningError" in str(error)) \
.described_as(f"Expected a VMExtensionProvisioningError error, but actual error was: {error}") \
.is_true()
assert_that("Extension will not be processed since extension processing is disabled" in str(error)) \
.described_as(
f"Error message should communicate that extension will not be processed, but actual error "
f"was: {error}").is_true()
log.info("Goal state processing for {0} failed as expected".format(t.extension.__str__()))

#
# Validate the agent did not process the extension by checking it did not execute the extension settings
#
output = ssh_client.run_command("dir /tmp", use_sudo=True)
assert_that(output) \
.described_as(
f"Contents of '/tmp' on test VM contains {test_file}. Contents: {output}. \n This indicates "
f"{t.extension.__str__()} was unexpectedly processed") \
.does_not_contain(f"{test_file}")
log.info("The agent did not process the extension settings for {0} as expected".format(t.extension.__str__()))

#
# Validate that the agent continued reporting status even if it is not processing extensions
#
vm: VirtualMachineClient = VirtualMachineClient(self._context.vm)
log.info("")
instance_view: VirtualMachineInstanceView = vm.get_instance_view()
log.info("Instance view of VM Agent:\n%s", instance_view.vm_agent.serialize())
assert_that(instance_view.vm_agent.statuses).described_as("The VM agent should have exactly 1 status").is_length(1)
Expand All @@ -82,10 +121,23 @@ def run(self):
.is_greater_than(pytz.utc.localize(disabled_timestamp))
log.info("The VM Agent reported status after extensions were disabled, as expected.")

def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
return [
{'message': 'No handler status found for Microsoft.Azure.Extensions.CustomScript'},
]
#
# Validate that the agent processes extensions after re-enabling extension processing
#
log.info("")
log.info("Enabling extension processing on the test VM [%s]", self._context.vm.name)
output = ssh_client.run_command("update-waagent-conf Extensions.Enabled=y", use_sudo=True)
log.info("Enable completed:\n%s", output)

for t in test_cases:
try:
log.info("")
log.info("Executing {0}; the agent should process the extension".format(t.extension.__str__()))
t.extension.enable(settings=t.settings, force_update=True, timeout=15 * 60)
log.info("Goal state processing for {0} succeeded as expected".format(t.extension.__str__()))
except Exception as error:
fail(f"Unexpected error while processing {t.extension.__str__()} after re-enabling extension "
f"processing: {error}")


if __name__ == "__main__":
Expand Down

0 comments on commit fc394af

Please sign in to comment.