Skip to content

Commit

Permalink
Merge branch 'develop' into agent_status_scenario
Browse files Browse the repository at this point in the history
  • Loading branch information
maddieford committed Aug 21, 2023
2 parents 2e1f887 + fd2a181 commit fc394af
Show file tree
Hide file tree
Showing 4 changed files with 177 additions and 69 deletions.
41 changes: 31 additions & 10 deletions azurelinuxagent/ga/exthandlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

from azurelinuxagent.common import conf
from azurelinuxagent.common import logger
from azurelinuxagent.common.osutil import get_osutil
from azurelinuxagent.common.utils import fileutil
from azurelinuxagent.common import version
from azurelinuxagent.common.agent_supported_feature import get_agent_supported_features_list_for_extensions, \
Expand Down Expand Up @@ -299,7 +300,7 @@ def run(self):
# we make a deep copy of the extensions, since changes are made to self.ext_handlers while processing the extensions
self.ext_handlers = copy.deepcopy(egs.extensions)

if not self._extension_processing_allowed():
if self._extensions_on_hold():
return

utc_start = datetime.datetime.utcnow()
Expand Down Expand Up @@ -433,17 +434,15 @@ def _cleanup_outdated_handlers(self):
except OSError as e:
logger.warn("Failed to remove extension package {0}: {1}".format(pkg, e.strerror))

def _extension_processing_allowed(self):
if not conf.get_extensions_enabled():
logger.verbose("Extension handling is disabled")
return False

def _extensions_on_hold(self):
    """Return True when the current goal state asks the agent to hold extension processing.

    The on-hold flag is only honored when over-provisioning support is enabled in the
    agent configuration; otherwise extension processing proceeds normally.

    Returns:
        bool: True if processing should be skipped for this goal state, False otherwise.
    """
    # The on-hold signal is only meaningful when over-provisioning is enabled.
    if conf.get_enable_overprovisioning():
        if self.protocol.get_goal_state().extensions_goal_state.on_hold:
            msg = "Extension handling is on hold"
            logger.info(msg)
            # Emit telemetry so the hold is visible outside the local log.
            add_event(op=WALAEventOperation.ExtensionProcessing, message=msg)
            return True

    return False

@staticmethod
def __get_dependency_level(tup):
Expand Down Expand Up @@ -478,10 +477,30 @@ def handle_ext_handlers(self, goal_state_id):
max_dep_level = self.__get_dependency_level(all_extensions[-1]) if any(all_extensions) else 0

depends_on_err_msg = None
extensions_enabled = conf.get_extensions_enabled()
for extension, ext_handler in all_extensions:

handler_i = ExtHandlerInstance(ext_handler, self.protocol, extension=extension)

# In case of extensions disabled, we skip processing extensions. But CRP is still waiting for some status
# back for the skipped extensions. In order to propagate the status back to CRP, we will report status back
# here with an error message.
if not extensions_enabled:
agent_conf_file_path = get_osutil().agent_conf_file_path
msg = "Extension will not be processed since extension processing is disabled. To enable extension " \
"processing, set Extensions.Enabled=y in '{0}'".format(agent_conf_file_path)
ext_full_name = handler_i.get_extension_full_name(extension)
logger.info('')
logger.info("{0}: {1}".format(ext_full_name, msg))
add_event(op=WALAEventOperation.ExtensionProcessing, message="{0}: {1}".format(ext_full_name, msg))
handler_i.set_handler_status(status=ExtHandlerStatusValue.not_ready, message=msg, code=-1)
handler_i.create_status_file_if_not_exist(extension,
status=ExtensionStatusValue.error,
code=-1,
operation=handler_i.operation,
message=msg)
continue

# In case of depends-on errors, we skip processing extensions if there was an error processing dependent extensions.
# But CRP is still waiting for some status back for the skipped extensions. In order to propagate the status back to CRP,
# we will report status back here with the relevant error message for each of the dependent extension.
Expand Down Expand Up @@ -991,7 +1010,9 @@ def report_ext_handler_status(self, vm_status, ext_handler, goal_state_changed):
# For MultiConfig, we need to report status per extension even for Handler level failures.
# If we have HandlerStatus for a MultiConfig handler and GS is requesting for it, we would report status per
# extension even if HandlerState == NotInstalled (Sample scenario: ExtensionsGoalStateError, DecideVersionError, etc)
if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config:
# We also need to report extension status for an uninstalled handler if extensions are disabled because CRP
# waits for extension runtime status before failing the extension operation.
if handler_state != ExtHandlerState.NotInstalled or ext_handler.supports_multi_config or not conf.get_extensions_enabled():

# Since we require reading the Manifest for reading the heartbeat, this would fail if HandlerManifest not found.
# Only try to read heartbeat if HandlerState != NotInstalled.
Expand Down
14 changes: 13 additions & 1 deletion tests/ga/test_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -1649,7 +1649,19 @@ def test_extensions_disabled(self, _, *args):
exthandlers_handler.run()
exthandlers_handler.report_ext_handlers_status()

self._assert_no_handler_status(protocol.report_vm_status)
report_vm_status = protocol.report_vm_status
self.assertTrue(report_vm_status.called)
args, kw = report_vm_status.call_args # pylint: disable=unused-variable
vm_status = args[0]
self.assertEqual(1, len(vm_status.vmAgent.extensionHandlers))
exthandler = vm_status.vmAgent.extensionHandlers[0]
self.assertEqual(-1, exthandler.code)
self.assertEqual('NotReady', exthandler.status)
self.assertEqual("Extension will not be processed since extension processing is disabled. To enable extension processing, set Extensions.Enabled=y in '/etc/waagent.conf'", exthandler.message)
ext_status = exthandler.extension_status
self.assertEqual(-1, ext_status.code)
self.assertEqual('error', ext_status.status)
self.assertEqual("Extension will not be processed since extension processing is disabled. To enable extension processing, set Extensions.Enabled=y in '/etc/waagent.conf'", ext_status.message)

def test_extensions_deleted(self, *args):
# Ensure initial enable is successful
Expand Down
89 changes: 56 additions & 33 deletions tests_e2e/orchestrator/lib/agent_test_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from tests_e2e.tests.lib.logging import log
from tests_e2e.tests.lib.logging import set_current_thread_log
from tests_e2e.tests.lib.agent_log import AgentLogRecord
from tests_e2e.tests.lib.shell import run_command
from tests_e2e.tests.lib.shell import run_command, CommandError
from tests_e2e.tests.lib.ssh_client import SshClient


Expand Down Expand Up @@ -304,29 +304,6 @@ def _setup_node(self, install_test_agent: bool) -> None:
log.info("Downloading %s to %s", pypy_download, pypy_path)
run_command(["wget", pypy_download, "-O", pypy_path])

#
# Create a tarball with the files we need to copy to the test node. The tarball includes two directories:
#
# * bin - Executables file (Bash and Python scripts)
# * lib - Library files (Python modules)
#
# After extracting the tarball on the test node, 'bin' will be added to PATH and PYTHONPATH will be set to 'lib'.
#
# Note that executables are placed directly under 'bin', while the path for Python modules is preserved under 'lib.
#
tarball_path: Path = Path("/tmp/waagent.tar")
log.info("Creating %s with the files need on the test node", tarball_path)
log.info("Adding orchestrator/scripts")
command = "cd {0} ; tar cvf {1} --transform='s,^,bin/,' *".format(self.context.test_source_directory/"orchestrator"/"scripts", str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Adding tests/scripts")
command = "cd {0} ; tar rvf {1} --transform='s,^,bin/,' *".format(self.context.test_source_directory/"tests"/"scripts", str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Adding tests/lib")
command = "cd {0} ; tar rvf {1} --transform='s,^,lib/,' --exclude=__pycache__ tests_e2e/tests/lib".format(self.context.test_source_directory.parent, str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Contents of %s:\n\n%s", tarball_path, run_command(['tar', 'tvf', str(tarball_path)]))

#
# Cleanup the test node (useful for developer runs)
#
Expand All @@ -335,24 +312,70 @@ def _setup_node(self, install_test_agent: bool) -> None:
self.context.ssh_client.run_command("rm -rvf ~/{bin,lib,tmp}", use_sudo=True)

#
# Copy the tarball, Pypy and the test Agent to the test node
# Copy Pypy and the test Agent to the test node
#
target_path = Path("~")/"tmp"
self.context.ssh_client.run_command(f"mkdir {target_path}")
log.info("Copying %s to %s:%s", tarball_path, self.context.node.name, target_path)
self.context.ssh_client.copy_to_node(tarball_path, target_path)
log.info("Copying %s to %s:%s", pypy_path, self.context.node.name, target_path)
self.context.ssh_client.copy_to_node(pypy_path, target_path)
agent_package_path: Path = self._get_agent_package_path()
log.info("Copying %s to %s:%s", agent_package_path, self.context.node.name, target_path)
self.context.ssh_client.copy_to_node(agent_package_path, target_path)

#
# Extract the tarball and execute the install scripts
#
log.info('Installing tools on the test node')
command = f"tar xvf {target_path/tarball_path.name} && ~/bin/install-tools"
log.info("Remote command [%s] completed:\n%s", command, self.context.ssh_client.run_command(command))
# tar commands sometimes fail with 'tar: Unexpected EOF in archive' error. Retry tarball creation, copy, and
# extraction if we hit this error
tar_retries = 3
while tar_retries > 0:
try:
#
# Create a tarball with the files we need to copy to the test node. The tarball includes two directories:
#
# * bin - Executables file (Bash and Python scripts)
# * lib - Library files (Python modules)
#
# After extracting the tarball on the test node, 'bin' will be added to PATH and PYTHONPATH will be set to 'lib'.
#
# Note that executables are placed directly under 'bin', while the path for Python modules is preserved under 'lib.
#
tarball_path: Path = Path("/tmp/waagent.tar")
log.info("Creating %s with the files need on the test node", tarball_path)
log.info("Adding orchestrator/scripts")
command = "cd {0} ; tar cvf {1} --transform='s,^,bin/,' *".format(self.context.test_source_directory/"orchestrator"/"scripts", str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Adding tests/scripts")
command = "cd {0} ; tar rvf {1} --transform='s,^,bin/,' *".format(self.context.test_source_directory/"tests"/"scripts", str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Adding tests/lib")
command = "cd {0} ; tar rvf {1} --transform='s,^,lib/,' --exclude=__pycache__ tests_e2e/tests/lib".format(self.context.test_source_directory.parent, str(tarball_path))
log.info("%s\n%s", command, run_command(command, shell=True))
log.info("Contents of %s:\n\n%s", tarball_path, run_command(['tar', 'tvf', str(tarball_path)]))

#
# Copy the tarball to the test node
#
log.info("Copying %s to %s:%s", tarball_path, self.context.node.name, target_path)
self.context.ssh_client.copy_to_node(tarball_path, target_path)

#
# Extract the tarball and execute the install scripts
#
log.info('Installing tools on the test node')
command = f"tar xvf {target_path/tarball_path.name} && ~/bin/install-tools"
log.info("Remote command [%s] completed:\n%s", command, self.context.ssh_client.run_command(command))

# Tarball creation and extraction was successful - no need to retry
tar_retries = 0

except CommandError as error:
if "tar: Unexpected EOF in archive" in error.stderr:
tar_retries -= 1
# Log the error with traceback to see which tar operation failed
log.info(f"Tarball creation or extraction failed: \n{error}")
# Retry tar operations
if tar_retries > 0:
log.info("Retrying tarball creation and extraction...")
else:
raise Exception(f"Unexpected error when creating or extracting tarball during node setup: {error}")

if self.context.is_vhd:
log.info("Using a VHD; will not install the Test Agent.")
Expand Down
102 changes: 77 additions & 25 deletions tests_e2e/tests/extensions_disabled/extensions_disabled.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@

import datetime
import pytz
import uuid

from assertpy import assert_that, fail
from typing import Any, Dict, List
from typing import Any

from azure.mgmt.compute.models import VirtualMachineInstanceView

Expand All @@ -39,39 +40,77 @@


class ExtensionsDisabled(AgentTest):
class TestCase:
    """Bundles an extension client together with the settings used to enable it.

    Instances are iterated by the test's run loop so each (extension, settings)
    pair can be exercised with identical logic.
    """

    def __init__(self, extension: VirtualMachineExtensionClient, settings: Any):
        # Store the enable-time settings alongside the client they belong to.
        self.settings = settings
        self.extension = extension

def run(self):
ssh_client: SshClient = self._context.create_ssh_client()

# Disable extension processing on the test VM
log.info("")
log.info("Disabling extension processing on the test VM [%s]", self._context.vm.name)
output = ssh_client.run_command("update-waagent-conf Extensions.Enabled=n", use_sudo=True)
log.info("Disable completed:\n%s", output)

# From now on, extensions will time out; set the timeout to the minimum allowed(15 minutes)
log.info("Setting the extension timeout to 15 minutes")
vm: VirtualMachineClient = VirtualMachineClient(self._context.vm)

vm.update({"extensionsTimeBudget": "PT15M"})

disabled_timestamp: datetime.datetime = datetime.datetime.utcnow() - datetime.timedelta(minutes=60)

#
# Validate that the agent is not processing extensions by attempting to run CustomScript
#
log.info("Executing CustomScript; it should time out after 15 min or so.")
custom_script = VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript, resource_name="CustomScript")
try:
custom_script.enable(settings={'commandToExecute': "date"}, force_update=True, timeout=20 * 60)
fail("CustomScript should have timed out")
except Exception as error:
assert_that("VMExtensionProvisioningTimeout" in str(error)) \
.described_as(f"Expected a VMExtensionProvisioningTimeout: {error}") \
.is_true()
log.info("CustomScript timed out as expected")
# Prepare test cases
unique = str(uuid.uuid4())
test_file = f"waagent-test.{unique}"
test_cases = [
ExtensionsDisabled.TestCase(
VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.CustomScript,
resource_name="CustomScript"),
{'commandToExecute': f"echo '{unique}' > /tmp/{test_file}"}
),
ExtensionsDisabled.TestCase(
VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.RunCommandHandler,
resource_name="RunCommandHandler"),
{'source': {'script': f"echo '{unique}' > /tmp/{test_file}"}}
)
]

for t in test_cases:
log.info("")
log.info("Test case: %s", t.extension)
#
# Validate that the agent is not processing extensions by attempting to enable extension & checking that
# provisioning fails fast
#
log.info(
"Executing {0}; the agent should report a VMExtensionProvisioningError without processing the extension"
.format(t.extension.__str__()))

try:
t.extension.enable(settings=t.settings, force_update=True, timeout=6 * 60)
fail("The agent should have reported an error processing the goal state")
except Exception as error:
assert_that("VMExtensionProvisioningError" in str(error)) \
.described_as(f"Expected a VMExtensionProvisioningError error, but actual error was: {error}") \
.is_true()
assert_that("Extension will not be processed since extension processing is disabled" in str(error)) \
.described_as(
f"Error message should communicate that extension will not be processed, but actual error "
f"was: {error}").is_true()
log.info("Goal state processing for {0} failed as expected".format(t.extension.__str__()))

#
# Validate the agent did not process the extension by checking it did not execute the extension settings
#
output = ssh_client.run_command("dir /tmp", use_sudo=True)
assert_that(output) \
.described_as(
f"Contents of '/tmp' on test VM contains {test_file}. Contents: {output}. \n This indicates "
f"{t.extension.__str__()} was unexpectedly processed") \
.does_not_contain(f"{test_file}")
log.info("The agent did not process the extension settings for {0} as expected".format(t.extension.__str__()))

#
# Validate that the agent continued reporting status even if it is not processing extensions
#
vm: VirtualMachineClient = VirtualMachineClient(self._context.vm)
log.info("")
instance_view: VirtualMachineInstanceView = vm.get_instance_view()
log.info("Instance view of VM Agent:\n%s", instance_view.vm_agent.serialize())
assert_that(instance_view.vm_agent.statuses).described_as("The VM agent should have exactly 1 status").is_length(1)
Expand All @@ -82,10 +121,23 @@ def run(self):
.is_greater_than(pytz.utc.localize(disabled_timestamp))
log.info("The VM Agent reported status after extensions were disabled, as expected.")

def get_ignore_error_rules(self) -> List[Dict[str, Any]]:
return [
{'message': 'No handler status found for Microsoft.Azure.Extensions.CustomScript'},
]
#
# Validate that the agent processes extensions after re-enabling extension processing
#
log.info("")
log.info("Enabling extension processing on the test VM [%s]", self._context.vm.name)
output = ssh_client.run_command("update-waagent-conf Extensions.Enabled=y", use_sudo=True)
log.info("Enable completed:\n%s", output)

for t in test_cases:
try:
log.info("")
log.info("Executing {0}; the agent should process the extension".format(t.extension.__str__()))
t.extension.enable(settings=t.settings, force_update=True, timeout=15 * 60)
log.info("Goal state processing for {0} succeeded as expected".format(t.extension.__str__()))
except Exception as error:
fail(f"Unexpected error while processing {t.extension.__str__()} after re-enabling extension "
f"processing: {error}")


if __name__ == "__main__":
Expand Down

0 comments on commit fc394af

Please sign in to comment.