Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add config parameter to wait for cloud-init (Extensions.WaitForCloudInit) #3031

Merged
merged 6 commits into from
Feb 1, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,30 @@ without the agent. In order to do that, the `provisionVMAgent` flag must be set
provisioning time, via whichever API is being used. We will provide more details on
this on our wiki when it is generally available.

#### __Extensions.WaitForCloudInit__

_Type: Boolean_
_Default: n_

Waits for cloud-init to complete (cloud-init status --wait) before executing VM extensions.

Both cloud-init and VM extensions are common ways to customize a VM during initial deployment. By
default, the agent will start executing extensions while cloud-init may still be in the 'config'
stage and won't wait for the 'final' stage to complete. Cloud-init and extensions may execute operations
that conflict with each other (for example, both of them may try to install packages). Setting this option
to 'y' ensures that VM extensions are executed only after cloud-init has completed all its stages.

Note that using this option requires creating a custom image with the value of this option set to 'y', in
order to ensure that the wait is performed during the initial deployment of the VM.

#### __Extensions.WaitForCloudInitTimeout__

_Type: Integer_
_Default: 3600_
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved

Timeout in seconds for the Agent to wait on cloud-init. If the timeout elapses, the Agent will continue
executing VM extensions. See Extensions.WaitForCloudInit for more details.

#### __Extensions.GoalStatePeriod__

_Type: Integer_
Expand Down
10 changes: 10 additions & 0 deletions azurelinuxagent/common/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
"Logs.Console": True,
"Logs.Collect": True,
"Extensions.Enabled": True,
"Extensions.WaitForCloudInit": False,
"Provisioning.AllowResetSysUser": False,
"Provisioning.RegenerateSshHostKeyPair": False,
"Provisioning.DeleteRootPassword": False,
Expand Down Expand Up @@ -170,6 +171,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
__INTEGER_OPTIONS__ = {
"Extensions.GoalStatePeriod": 6,
"Extensions.InitialGoalStatePeriod": 6,
"Extensions.WaitForCloudInitTimeout": 3600,
"OS.EnableFirewallPeriod": 300,
"OS.RemovePersistentNetRulesPeriod": 30,
"OS.RootDeviceScsiTimeoutPeriod": 30,
Expand Down Expand Up @@ -372,6 +374,14 @@ def get_extensions_enabled(conf=__conf__):
return conf.get_switch("Extensions.Enabled", True)


def get_wait_for_cloud_init(conf=__conf__):
return conf.get_switch("Extensions.WaitForCloudInit", False)


def get_wait_for_cloud_init_timeout(conf=__conf__):
return conf.get_switch("Extensions.WaitForCloudInitTimeout", 3600)


def get_goal_state_period(conf=__conf__):
return conf.get_int("Extensions.GoalStatePeriod", 6)

Expand Down
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class WALAEventOperation:
CGroupsCleanUp = "CGroupsCleanUp"
CGroupsDisabled = "CGroupsDisabled"
CGroupsInfo = "CGroupsInfo"
CloudInit = "CloudInit"
CollectEventErrors = "CollectEventErrors"
CollectEventUnicodeErrors = "CollectEventUnicodeErrors"
ConfigurationChange = "ConfigurationChange"
Expand Down
39 changes: 36 additions & 3 deletions azurelinuxagent/common/utils/shellutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,17 @@
#
import os
import subprocess
import sys
import tempfile
import threading

if sys.version_info[0] == 2:
# TimeoutExpired was introduced on Python 3; define a dummy class for Python 2
class TimeoutExpired(Exception):
pass
else:
from subprocess import TimeoutExpired

import azurelinuxagent.common.logger as logger
from azurelinuxagent.common.future import ustr

Expand Down Expand Up @@ -206,7 +214,7 @@ def __run_command(command_action, command, log_error, encode_output):


# W0622: Redefining built-in 'input' -- disabled: the parameter name mimics subprocess.communicate()
def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, log_error=False, encode_input=True, encode_output=True, track_process=True): # pylint:disable=W0622
def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE, log_error=False, encode_input=True, encode_output=True, track_process=True, timeout=None): # pylint:disable=W0622
"""
Executes the given command and returns its stdout.

Expand All @@ -227,7 +235,9 @@ def run_command(command, input=None, stdin=None, stdout=subprocess.PIPE, stderr=
value for these parameters is anything other than the default (subprocess.PIPE)), then the corresponding
values returned by this function or the CommandError exception will be empty strings.

Note: This is the preferred method to execute shell commands over `azurelinuxagent.common.utils.shellutil.run` function.
NOTE: The 'timeout' parameter is ignored on Python 2

NOTE: This is the preferred method to execute shell commands over `azurelinuxagent.common.utils.shellutil.run` function.
"""
if input is not None and stdin is not None:
raise ValueError("The input and stdin arguments are mutually exclusive")
Expand All @@ -246,7 +256,30 @@ def command_action():
else:
process = subprocess.Popen(command, stdin=popen_stdin, stdout=stdout, stderr=stderr, shell=False)

command_stdout, command_stderr = process.communicate(input=communicate_input)
try:
if sys.version_info[0] == 2: # communicate() doesn't support timeout on Python 2
command_stdout, command_stderr = process.communicate(input=communicate_input)
else:
command_stdout, command_stderr = process.communicate(input=communicate_input, timeout=timeout)
except TimeoutExpired:
if log_error:
logger.error(u"Command [{0}] timed out", __format_command(command))

command_stdout, command_stderr = '', ''

try:
process.kill()
# try to get any output from the command, but ignore any errors if we can't
try:
command_stdout, command_stderr = process.communicate()
# W0702: No exception type(s) specified (bare-except)
except: # pylint: disable=W0702
pass
except Exception as exception:
if log_error:
logger.error(u"Can't terminate timed out process: {0}", ustr(exception))
raise CommandError(command=__format_command(command), return_code=-1, stdout=command_stdout, stderr="command timeout\n{0}".format(command_stderr))
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved

if track_process:
_on_command_completed(process.pid)

Expand Down
20 changes: 20 additions & 0 deletions azurelinuxagent/ga/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ def __init__(self):
self._last_check_memory_usage_time = time.time()
self._check_memory_usage_last_error_report = datetime.min

self._cloud_init_completed = False # Only used when Extensions.WaitForCloudInit is enabled; note that this variable is always reset on service start.
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved

# VM Size is reported via the heartbeat, default it here.
self._vm_size = None

Expand Down Expand Up @@ -458,6 +460,22 @@ def _initialize_goal_state(self, protocol):
logger.info("The current Fabric goal state is older than the most recent FastTrack goal state; will skip it.\nFabric: {0}\nFastTrack: {1}",
egs.created_on_timestamp, last_fast_track_timestamp)

def _wait_for_cloud_init(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On my PR for updating the daemon to clean up any stale files if VM image was not properly generalized, we discussed the possibility that agent and cloud-init will be racing (since the agent will detect that the vm is in a 'provisioned' state when it is not).

Could we add a call to _wait_for_cloud_init() in the daemon to protect against this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The race I was mentioning is against the Config stage, that is where the Azure module does the provisioning stuff. The wait I am adding is against the Final stage. PA does not wait for Final because there could be lengthy operations there that could affect TDPR (that is why the option I added needs to be explicitly set on a custom image).

Waiting for Config when 'provisioned' exists may be a good idea, but I would leave the decision of whether to implement it to PA.

https://cloudinit.readthedocs.io/en/latest/explanation/boot.html

There are five stages to boot:

1 Detect
2 Local
3 Network
4 Config
5 Final

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense

if conf.get_wait_for_cloud_init() and not self._cloud_init_completed:
message = "Waiting for cloud-init to complete..."
logger.info(message)
add_event(op=WALAEventOperation.CloudInit, message=message)
try:
output = shellutil.run_command(["cloud-init", "status", "--wait"], timeout=conf.get_wait_for_cloud_init_timeout())
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved
message = "cloud-init completed\n{0}".format(output)
logger.info(message)
add_event(op=WALAEventOperation.CloudInit, message=message)
except Exception as e:
message = "An error occurred while waiting for cloud-init; will proceed to execute VM extensions. Extensions that have conflicts with cloud-init may fail.\n{0}".format(ustr(e))
logger.error(message)
add_event(op=WALAEventOperation.CloudInit, message=message, is_success=False, log_event=False)
self._cloud_init_completed = True # Mark as completed even on error since we will proceed to execute extensions

def _get_vm_size(self, protocol):
"""
Including VMSize is meant to capture the architecture of the VM (i.e. arm64 VMs will
Expand Down Expand Up @@ -562,6 +580,8 @@ def _process_goal_state(self, exthandlers_handler, remote_access_handler, agent_
# check for agent updates
agent_update_handler.run(self._goal_state, self._processing_new_extensions_goal_state())

self._wait_for_cloud_init()
nagworld9 marked this conversation as resolved.
Show resolved Hide resolved

try:
if self._processing_new_extensions_goal_state():
if not self._extensions_summary.converged:
Expand Down
2 changes: 2 additions & 0 deletions tests/common/test_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ class TestConf(AgentTestCase):
# -- These values *MUST* match those from data/test_waagent.conf
EXPECTED_CONFIGURATION = {
"Extensions.Enabled": True,
"Extensions.WaitForCloudInit": False,
"Extensions.WaitForCloudInitTimeout": 3600,
"Provisioning.Agent": "auto",
"Provisioning.DeleteRootPassword": True,
"Provisioning.RegenerateSshHostKeyPair": True,
Expand Down
9 changes: 8 additions & 1 deletion tests/common/utils/test_shell_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@
import os
import signal
import subprocess
import sys
import tempfile
import threading
import unittest

from azurelinuxagent.common.future import ustr
import azurelinuxagent.common.utils.shellutil as shellutil
from tests.lib.tools import AgentTestCase, patch
from tests.lib.tools import AgentTestCase, patch, skip_if_predicate_true
from tests.lib.miscellaneous_tools import wait_for, format_processes


Expand Down Expand Up @@ -225,6 +226,12 @@ def test_run_command_should_raise_an_exception_when_it_cannot_execute_the_comman
self.__it_should_raise_an_exception_when_it_cannot_execute_the_command(
lambda: shellutil.run_command("nonexistent_command"))

@skip_if_predicate_true(lambda: sys.version_info[0] == 2, "Timeouts are not supported on Python 2")
def test_run_command_should_raise_an_exception_when_the_command_times_out(self):
with self.assertRaises(shellutil.CommandError) as context:
shellutil.run_command(["sleep", "5"], timeout=1)
self.assertIn("command timeout", context.exception.stderr, "The command did not time out")

def test_run_pipe_should_raise_an_exception_when_it_cannot_execute_the_pipe(self):
self.__it_should_raise_an_exception_when_it_cannot_execute_the_command(
lambda: shellutil.run_pipe([["ls", "-ld", "."], ["nonexistent_command"], ["wc", "-l"]]))
Expand Down
61 changes: 59 additions & 2 deletions tests/ga/test_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
ExtHandlerPackage, ExtHandlerPackageList, Extension, VMStatus, ExtHandlerStatus, ExtensionStatus, \
VMAgentUpdateStatuses
from azurelinuxagent.common.protocol.util import ProtocolUtil
from azurelinuxagent.common.utils import fileutil, textutil, timeutil
from azurelinuxagent.common.utils import fileutil, textutil, timeutil, shellutil
from azurelinuxagent.common.utils.archive import ARCHIVE_DIRECTORY_NAME, AGENT_STATUS_FILE
from azurelinuxagent.common.utils.flexible_version import FlexibleVersion
from azurelinuxagent.common.utils.networkutil import FirewallCmdDirectCommands, AddFirewallRules
Expand Down Expand Up @@ -980,7 +980,6 @@ def match_expected_info():
match_unexpected_errors() # Match on errors first, they can provide more info.
match_expected_info()


def test_it_should_recreate_handler_env_on_service_startup(self):
iterations = 5

Expand Down Expand Up @@ -1361,6 +1360,64 @@ def test_it_should_reset_legacy_blacklisted_agents_on_process_start(self):
self.assertFalse(agent.is_blacklisted, "Legacy Agent should not be blacklisted")


class TestUpdateWaitForCloudInit(AgentTestCase):
@staticmethod
@contextlib.contextmanager
def create_mock_run_command(delay=None):
def run_command_mock(cmd, *args, **kwargs):
if cmd == ["cloud-init", "status", "--wait"]:
if delay is not None:
original_run_command(['sleep', str(delay)], *args, **kwargs)
return "cloud-init completed"
return original_run_command(cmd, *args, **kwargs)
original_run_command = shellutil.run_command

with patch("azurelinuxagent.ga.update.shellutil.run_command", side_effect=run_command_mock) as run_command_patch:
yield run_command_patch

def test_it_should_not_wait_for_cloud_init_by_default(self):
update_handler = UpdateHandler()
with self.create_mock_run_command() as run_command_patch:
update_handler._wait_for_cloud_init()
self.assertTrue(run_command_patch.call_count == 0, "'cloud-init status --wait' should not be called by default")

def test_it_should_wait_for_cloud_init_when_requested(self):
update_handler = UpdateHandler()
with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init", return_value=True):
with self.create_mock_run_command() as run_command_patch:
update_handler._wait_for_cloud_init()
self.assertEqual(1, run_command_patch.call_count, "'cloud-init status --wait' should have be called once")

@skip_if_predicate_true(lambda: sys.version_info[0] == 2, "Timeouts are not supported on Python 2")
def test_it_should_enforce_timeout_waiting_for_cloud_init(self):
update_handler = UpdateHandler()
with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init", return_value=True):
with patch("azurelinuxagent.ga.update.conf.get_wait_for_cloud_init_timeout", return_value=1):
with self.create_mock_run_command(delay=5):
with patch("azurelinuxagent.ga.update.logger.error") as mock_logger:
update_handler._wait_for_cloud_init()
call_args = [args for args, _ in mock_logger.call_args_list if "An error occurred while waiting for cloud-init" in args[0]]
self.assertTrue(
len(call_args) == 1 and len(call_args[0]) == 1 and "command timeout" in call_args[0][0],
"Expected a timeout waiting for cloud-init. Log calls: {0}".format(mock_logger.call_args_list))

def test_update_handler_should_wait_for_cloud_init_after_agent_update_and_before_extension_processing(self):
method_calls = []

agent_update_handler = Mock()
agent_update_handler.run = lambda *_, **__: method_calls.append("AgentUpdateHandler.run()")

exthandlers_handler = Mock()
exthandlers_handler.run = lambda *_, **__: method_calls.append("ExtHandlersHandler.run()")

with mock_wire_protocol(DATA_FILE) as protocol:
with mock_update_handler(protocol, iterations=1, agent_update_handler=agent_update_handler, exthandlers_handler=exthandlers_handler) as update_handler:
with patch('azurelinuxagent.ga.update.UpdateHandler._wait_for_cloud_init', side_effect=lambda *_, **__: method_calls.append("UpdateHandler._wait_for_cloud_init()")):
update_handler.run()

self.assertListEqual(["AgentUpdateHandler.run()", "UpdateHandler._wait_for_cloud_init()", "ExtHandlersHandler.run()"], method_calls, "Wait for cloud-init should happen after agent update and before extension processing")


class UpdateHandlerRunTestCase(AgentTestCase):
def _test_run(self, autoupdate_enabled=False, check_daemon_running=False, expected_exit_code=0, emit_restart_event=None):
fileutil.write_file(conf.get_agent_pid_file_path(), ustr(42))
Expand Down
4 changes: 2 additions & 2 deletions tests/lib/mock_update_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,9 @@ def patch_object(target, attribute):

try:
with patch("azurelinuxagent.ga.exthandlers.get_exthandlers_handler", return_value=exthandlers_handler):
with patch("azurelinuxagent.ga.agent_update_handler.get_agent_update_handler", return_value=agent_update_handler):
with patch("azurelinuxagent.ga.update.get_agent_update_handler", return_value=agent_update_handler):
with patch("azurelinuxagent.ga.remoteaccess.get_remote_access_handler", return_value=remote_access_handler):
with patch("azurelinuxagent.common.conf.get_autoupdate_enabled", return_value=autoupdate_enabled):
with patch("azurelinuxagent.ga.update.conf.get_autoupdate_enabled", return_value=autoupdate_enabled):
with patch.object(UpdateHandler, "is_running", PropertyMock(side_effect=is_running)):
with patch('azurelinuxagent.ga.update.time.sleep', side_effect=lambda _: mock_sleep(0.001)) as sleep:
with patch('sys.exit', side_effect=lambda _: 0) as mock_exit:
Expand Down
2 changes: 2 additions & 0 deletions tests/test_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
Extensions.Enabled = True
Extensions.GoalStatePeriod = 6
Extensions.InitialGoalStatePeriod = 6
Extensions.WaitForCloudInit = False
Extensions.WaitForCloudInitTimeout = 3600
HttpProxy.Host = None
HttpProxy.Port = None
Lib.Dir = /var/lib/waagent
Expand Down
Loading
Loading