Skip to content

Commit

Permalink
Merge branch 'develop' into recover_primary_nic
Browse files Browse the repository at this point in the history
  • Loading branch information
maddieford committed Feb 3, 2024
2 parents b145881 + 20f0670 commit 1a2e888
Show file tree
Hide file tree
Showing 17 changed files with 482 additions and 64 deletions.
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,30 @@ without the agent. In order to do that, the `provisionVMAgent` flag must be set
provisioning time, via whichever API is being used. We will provide more details on
this on our wiki when it is generally available.

#### __Extensions.WaitForCloudInit__

_Type: Boolean_
_Default: n_

Waits for cloud-init to complete (cloud-init status --wait) before executing VM extensions.

Both cloud-init and VM extensions are common ways to customize a VM during initial deployment. By
default, the agent will start executing extensions while cloud-init may still be in the 'config'
stage and won't wait for the 'final' stage to complete. Cloud-init and extensions may execute operations
that conflict with each other (for example, both of them may try to install packages). Setting this option
to 'y' ensures that VM extensions are executed only after cloud-init has completed all its stages.

Note that using this option requires creating a custom image with the value of this option set to 'y', in
order to ensure that the wait is performed during the initial deployment of the VM.

#### __Extensions.WaitForCloudInitTimeout__

_Type: Integer_
_Default: 3600_

Timeout in seconds for the Agent to wait on cloud-init. If the timeout elapses, the Agent will continue
executing VM extensions. See Extensions.WaitForCloudInit for more details.

#### __Extensions.GoalStatePeriod__

_Type: Integer_
Expand Down
10 changes: 10 additions & 0 deletions azurelinuxagent/common/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
"Logs.Console": True,
"Logs.Collect": True,
"Extensions.Enabled": True,
"Extensions.WaitForCloudInit": False,
"Provisioning.AllowResetSysUser": False,
"Provisioning.RegenerateSshHostKeyPair": False,
"Provisioning.DeleteRootPassword": False,
Expand Down Expand Up @@ -170,6 +171,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
__INTEGER_OPTIONS__ = {
"Extensions.GoalStatePeriod": 6,
"Extensions.InitialGoalStatePeriod": 6,
"Extensions.WaitForCloudInitTimeout": 3600,
"OS.EnableFirewallPeriod": 300,
"OS.RemovePersistentNetRulesPeriod": 30,
"OS.RootDeviceScsiTimeoutPeriod": 30,
Expand Down Expand Up @@ -372,6 +374,14 @@ def get_extensions_enabled(conf=__conf__):
return conf.get_switch("Extensions.Enabled", True)


def get_wait_for_cloud_init(conf=__conf__):
return conf.get_switch("Extensions.WaitForCloudInit", False)


def get_wait_for_cloud_init_timeout(conf=__conf__):
return conf.get_switch("Extensions.WaitForCloudInitTimeout", 3600)


def get_goal_state_period(conf=__conf__):
return conf.get_int("Extensions.GoalStatePeriod", 6)

Expand Down
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class WALAEventOperation:
CGroupsCleanUp = "CGroupsCleanUp"
CGroupsDisabled = "CGroupsDisabled"
CGroupsInfo = "CGroupsInfo"
CloudInit = "CloudInit"
CollectEventErrors = "CollectEventErrors"
CollectEventUnicodeErrors = "CollectEventUnicodeErrors"
ConfigurationChange = "ConfigurationChange"
Expand Down
7 changes: 7 additions & 0 deletions azurelinuxagent/common/osutil/redhat.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,3 +265,10 @@ def restart_if(self, ifname, retries=3, wait=5):
def check_and_recover_nic_state(self, ifname):
# TODO: Implement and test a way to recover the network interface for RedhatOSModernUtil
pass

def publish_hostname(self, hostname):
# RedhatOSUtil was updated to conditionally run NetworkManager restart in response to a race condition between
# NetworkManager restart and the agent restarting the network interface during publish_hostname. Keeping the
# NetworkManager restart in RedhatOSModernUtil because the issue was not reproduced on these versions.
shellutil.run("service NetworkManager restart")
DefaultOSUtil.publish_hostname(self, hostname)
39 changes: 36 additions & 3 deletions azurelinuxagent/common/utils/shellutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,17 @@
#
import os
import subprocess
import sys
import tempfile
import threading

if sys.version_info[0] == 2:
# TimeoutExpired was introduced on Python 3; define a dummy class for Python 2
class TimeoutExpired(Exception):
pass
else:
from subprocess import TimeoutExpired

import azurelinuxagent.common.logger as logger
from azurelinuxagent.common.future import ustr

Expand Down Expand Up @@ -206,7 +214,7 @@ def __run_command(command_action, command, log_error, encode_output):


# W0622: Redefining built-in 'input' -- disabled: the parameter name mimics subprocess.communicate()
def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, log_error=False, encode_input=True, encode_output=True, track_process=True): # pylint:disable=W0622
def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, log_error=False, encode_input=True, encode_output=True, track_process=True, timeout=None): # pylint:disable=W0622
"""
Executes the given command and returns its stdout.
Expand All @@ -227,7 +235,9 @@ def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=
value for these parameters is anything other than the default (subprocess.PIPE)), then the corresponding
values returned by this function or the CommandError exception will be empty strings.
Note: This is the preferred method to execute shell commands over `azurelinuxagent.common.utils.shellutil.run` function.
NOTE: The 'timeout' parameter is ignored on Python 2
NOTE: This is the preferred method to execute shell commands over `azurelinuxagent.common.utils.shellutil.run` function.
"""
if input is not None and stdin is not None:
raise ValueError("The input and stdin arguments are mutually exclusive")
Expand All @@ -246,7 +256,30 @@ def command_action():
else:
process = subprocess.Popen(command, stdin=popen_stdin, stdout=stdout, stderr=stderr, shell=False)

command_stdout, command_stderr = process.communicate(input=communicate_input)
try:
if sys.version_info[0] == 2: # communicate() doesn't support timeout on Python 2
command_stdout, command_stderr = process.communicate(input=communicate_input)
else:
command_stdout, command_stderr = process.communicate(input=communicate_input, timeout=timeout)
except TimeoutExpired:
if log_error:
logger.error(u"Command [{0}] timed out", __format_command(command))

command_stdout, command_stderr = '', ''

try:
process.kill()
# try to get any output from the command, but ignore any errors if we can't
try:
command_stdout, command_stderr = process.communicate()
# W0702: No exception type(s) specified (bare-except)
except: # pylint: disable=W0702
pass
except Exception as exception:
if log_error:
logger.error(u"Can't terminate timed out process: {0}", ustr(exception))
raise CommandError(command=__format_command(command), return_code=-1, stdout=command_stdout, stderr="command timeout\n{0}".format(command_stderr))

if track_process:
_on_command_completed(process.pid)

Expand Down
20 changes: 20 additions & 0 deletions azurelinuxagent/ga/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ def __init__(self):
self._last_check_memory_usage_time = time.time()
self._check_memory_usage_last_error_report = datetime.min

self._cloud_init_completed = False # Only used when Extensions.WaitForCloudInit is enabled; note that this variable is always reset on service start.

# VM Size is reported via the heartbeat, default it here.
self._vm_size = None

Expand Down Expand Up @@ -458,6 +460,22 @@ def _initialize_goal_state(self, protocol):
logger.info("The current Fabric goal state is older than the most recent FastTrack goal state; will skip it.\nFabric: {0}\nFastTrack: {1}",
egs.created_on_timestamp, last_fast_track_timestamp)

def _wait_for_cloud_init(self):
if conf.get_wait_for_cloud_init() and not self._cloud_init_completed:
message = "Waiting for cloud-init to complete..."
logger.info(message)
add_event(op=WALAEventOperation.CloudInit, message=message)
try:
output = shellutil.run_command(["cloud-init", "status", "--wait"], timeout=conf.get_wait_for_cloud_init_timeout())
message = "cloud-init completed\n{0}".format(output)
logger.info(message)
add_event(op=WALAEventOperation.CloudInit, message=message)
except Exception as e:
message = "An error occurred while waiting for cloud-init; will proceed to execute VM extensions. Extensions that have conflicts with cloud-init may fail.\n{0}".format(ustr(e))
logger.error(message)
add_event(op=WALAEventOperation.CloudInit, message=message, is_success=False, log_event=False)
self._cloud_init_completed = True # Mark as completed even on error since we will proceed to execute extensions

def _get_vm_size(self, protocol):
"""
Including VMSize is meant to capture the architecture of the VM (i.e. arm64 VMs will
Expand Down Expand Up @@ -562,6 +580,8 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler, agent_
# check for agent updates
agent_update_handler.run(self._goal_state, self._processing_new_extensions_goal_state())

self._wait_for_cloud_init()

try:
if self._processing_new_extensions_goal_state():
if not self._extensions_summary.converged:
Expand Down
2 changes: 2 additions & 0 deletions tests/common/test_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class TestConf(AgentTestCase):
# -- These values *MUST* match those from data/test_waagent.conf
EXPECTED_CONFIGURATION = {
"Extensions.Enabled": True,
"Extensions.WaitForCloudInit": False,
"Extensions.WaitForCloudInitTimeout": 3600,
"Provisioning.Agent": "auto",
"Provisioning.DeleteRootPassword": True,
"Provisioning.RegenerateSshHostKeyPair": True,
Expand Down
9 changes: 8 additions & 1 deletion tests/common/utils/test_shell_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@
import os
import signal
import subprocess
import sys
import tempfile
import threading
import unittest

from azurelinuxagent.common.future import ustr
import azurelinuxagent.common.utils.shellutil as shellutil
from tests.lib.tools import AgentTestCase, patch
from tests.lib.tools import AgentTestCase, patch, skip_if_predicate_true
from tests.lib.miscellaneous_tools import wait_for, format_processes


Expand Down Expand Up @@ -225,6 +226,12 @@ def test_run_command_should_raise_an_exception_when_it_cannot_execute_the_comman
self.__it_should_raise_an_exception_when_it_cannot_execute_the_command(
lambda: shellutil.run_command("nonexistent_command"))

@skip_if_predicate_true(lambda: sys.version_info[0] == 2, "Timeouts are not supported on Python 2")
def test_run_command_should_raise_an_exception_when_the_command_times_out(self):
with self.assertRaises(shellutil.CommandError) as context:
shellutil.run_command(["sleep", "5"], timeout=1)
self.assertIn("command timeout", context.exception.stderr, "The command did not time out")

def test_run_pipe_should_raise_an_exception_when_it_cannot_execute_the_pipe(self):
self.__it_should_raise_an_exception_when_it_cannot_execute_the_command(
lambda: shellutil.run_pipe([["ls", "-ld", "."], ["nonexistent_command"], ["wc", "-l"]]))
Expand Down
61 changes: 59 additions & 2 deletions tests/ga/test_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
ExtHandlerPackage, ExtHandlerPackageList, Extension, VMStatus, ExtHandlerStatus, ExtensionStatus, \
VMAgentUpdateStatuses
from azurelinuxagent.common.protocol.util import ProtocolUtil
from azurelinuxagent.common.utils import fileutil, textutil, timeutil
from azurelinuxagent.common.utils import fileutil, textutil, timeutil, shellutil
from azurelinuxagent.common.utils.archive import ARCHIVE_DIRECTORY_NAME, AGENT_STATUS_FILE
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.utils.networkutil import FirewallCmdDirectCommands, AddFirewallRules
Expand Down Expand Up @@ -980,7 +980,6 @@ def match_expected_info():
match_unexpected_errors() # Match on errors first, they can provide more info.
match_expected_info()


def test_it_should_recreate_handler_env_on_service_startup(self):
iterations = 5

Expand Down Expand Up @@ -1361,6 +1360,64 @@ def test_it_should_reset_legacy_blacklisted_agents_on_process_start(self):
self.assertFalse(agent.is_blacklisted, "Legacy Agent should not be blacklisted")


class TestUpdateWaitForCloudInit(AgentTestCase):
@staticmethod
@contextlib.contextmanager
def create_mock_run_command(delay=None):
def run_command_mock(cmd, *args, **kwargs):
if cmd == ["cloud-init", "status", "--wait"]:
if delay is not None:
original_run_command(['sleep', str(delay)], *args, **kwargs)
return "cloud-init completed"
return original_run_command(cmd, *args, **kwargs)
original_run_command = shellutil.run_command

with patch("azurelinuxagent.ga.update.shellutil.run_command", side_effect=run_command_mock) as run_command_patch:
yield run_command_patch

def test_it_should_not_wait_for_cloud_init_by_default(self):
update_handler = UpdateHandler()
with self.create_mock_run_command() as run_command_patch:
update_handler._wait_for_cloud_init()
self.assertTrue(run_command_patch.call_count == 0, "'cloud-init status --wait' should not be called by default")

def test_it_should_wait_for_cloud_init_when_requested(self):
update_handler = UpdateHandler()
with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init", return_value=True):
with self.create_mock_run_command() as run_command_patch:
update_handler._wait_for_cloud_init()
self.assertEqual(1, run_command_patch.call_count, "'cloud-init status --wait' should have be called once")

@skip_if_predicate_true(lambda: sys.version_info[0] == 2, "Timeouts are not supported on Python 2")
def test_it_should_enforce_timeout_waiting_for_cloud_init(self):
update_handler = UpdateHandler()
with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init", return_value=True):
with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init_timeout", return_value=1):
with self.create_mock_run_command(delay=5):
with patch("azurelinuxagent.ga.update.logger.error") as mock_logger:
update_handler._wait_for_cloud_init()
call_args = [args for args, _ in mock_logger.call_args_list if "An error occurred while waiting for cloud-init" in args[0]]
self.assertTrue(
len(call_args) == 1 and len(call_args[0]) == 1 and "command timeout" in call_args[0][0],
"Expected a timeout waiting for cloud-init. Log calls: {0}".format(mock_logger.call_args_list))

def test_update_handler_should_wait_for_cloud_init_after_agent_update_and_before_extension_processing(self):
method_calls = []

agent_update_handler = Mock()
agent_update_handler.run = lambda *_, **__: method_calls.append("AgentUpdateHandler.run()")

exthandlers_handler = Mock()
exthandlers_handler.run = lambda *_, **__: method_calls.append("ExtHandlersHandler.run()")

with mock_wire_protocol(DATA_FILE) as protocol:
with mock_update_handler(protocol, iterations=1, agent_update_handler=agent_update_handler, exthandlers_handler=exthandlers_handler) as update_handler:
with patch('azurelinuxagent.ga.update.UpdateHandler._wait_for_cloud_init', side_effect=lambda *_, **__: method_calls.append("UpdateHandler._wait_for_cloud_init()")):
update_handler.run()

self.assertListEqual(["AgentUpdateHandler.run()", "UpdateHandler._wait_for_cloud_init()", "ExtHandlersHandler.run()"], method_calls, "Wait for cloud-init should happen after agent update and before extension processing")


class UpdateHandlerRunTestCase(AgentTestCase):
def _test_run(self, autoupdate_enabled=False, check_daemon_running=False, expected_exit_code=0, emit_restart_event=None):
fileutil.write_file(conf.get_agent_pid_file_path(), ustr(42))
Expand Down
4 changes: 2 additions & 2 deletions tests/lib/mock_update_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,9 @@ def patch_object(target, attribute):

try:
with patch("azurelinuxagent.ga.exthandlers.get_exthandlers_handler", return_value=exthandlers_handler):
with patch("azurelinuxagent.ga.agent_update_handler.get_agent_update_handler", return_value=agent_update_handler):
with patch("azurelinuxagent.ga.update.get_agent_update_handler", return_value=agent_update_handler):
with patch("azurelinuxagent.ga.remoteaccess.get_remote_access_handler", return_value=remote_access_handler):
with patch("azurelinuxagent.common.conf.get_autoupdate_enabled", return_value=autoupdate_enabled):
with patch("azurelinuxagent.ga.update.conf.get_autoupdate_enabled", return_value=autoupdate_enabled):
with patch.object(UpdateHandler, "is_running", PropertyMock(side_effect=is_running)):
with patch('azurelinuxagent.ga.update.time.sleep', side_effect=lambda _: mock_sleep(0.001)) as sleep:
with patch('sys.exit', side_effect=lambda _: 0) as mock_exit:
Expand Down
2 changes: 2 additions & 0 deletions tests/test_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
Extensions.Enabled = True
Extensions.GoalStatePeriod = 6
Extensions.InitialGoalStatePeriod = 6
Extensions.WaitForCloudInit = False
Extensions.WaitForCloudInitTimeout = 3600
HttpProxy.Host = None
HttpProxy.Port = None
Lib.Dir = /var/lib/waagent
Expand Down
Loading

0 comments on commit 1a2e888

Please sign in to comment.