Skip to content

Commit

Permalink
Merge pull request #2884 from Azure/release-2.9.1.0
Browse files Browse the repository at this point in the history
Merge release 2.9.1.0 to master
  • Loading branch information
maddieford committed Jul 27, 2023
2 parents dce0341 + 5aa163f commit 28345a5
Show file tree
Hide file tree
Showing 159 changed files with 5,651 additions and 5,513 deletions.
67 changes: 42 additions & 25 deletions .github/workflows/ci_pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,74 +10,91 @@ on:
jobs:
test-legacy-python-versions:

name: "Python 2.6 Unit Tests"
runs-on: ubuntu-18.04

strategy:
fail-fast: false
matrix:
include:
- python-version: 2.6
- python-version: 3.4

name: "Python ${{ matrix.python-version }} Unit Tests"
runs-on: ubuntu-20.04
container:
image: ubuntu:16.04
volumes:
- /home/waagent:/home/waagent
defaults:
run:
shell: bash -l {0}

env:
NOSEOPTS: "--verbose"

steps:
- uses: actions/checkout@v3

- name: Install Python 2.6
- name: Install Python ${{ matrix.python-version }}
run: |
curl https://dcrdata.blob.core.windows.net/python/python-2.6.tar.bz2 -o python-2.6.tar.bz2
sudo tar xjvf python-2.6.tar.bz2 --directory /
- uses: actions/checkout@v2
apt-get update
apt-get install -y curl bzip2 sudo python3
curl https://dcrdata.blob.core.windows.net/python/python-${{ matrix.python-version }}.tar.bz2 -o python-${{ matrix.python-version }}.tar.bz2
sudo tar xjvf python-${{ matrix.python-version }}.tar.bz2 --directory /
- name: Test with nosetests
run: |
source /home/waagent/virtualenv/python2.6.9/bin/activate
if [[ ${{ matrix.python-version }} == 2.6 ]]; then
source /home/waagent/virtualenv/python2.6.9/bin/activate
else
source /home/waagent/virtualenv/python3.4.8/bin/activate
fi
./ci/nosetests.sh
exit $?
test-current-python-versions:

strategy:
fail-fast: false
matrix:
include:

- python-version: 2.7
PYLINTOPTS: "--rcfile=ci/2.7.pylintrc"
PYLINTOPTS: "--rcfile=ci/2.7.pylintrc --ignore=tests_e2e,makepkg.py"

- python-version: 3.4
PYLINTOPTS: "--rcfile=ci/2.7.pylintrc"
- python-version: 3.5
PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e,makepkg.py"

- python-version: 3.6
PYLINTOPTS: "--rcfile=ci/3.6.pylintrc"
PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"

- python-version: 3.7
PYLINTOPTS: "--rcfile=ci/3.6.pylintrc"
PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"

- python-version: 3.8
PYLINTOPTS: "--rcfile=ci/3.6.pylintrc"
PYLINTOPTS: "--rcfile=ci/3.6.pylintrc --ignore=tests_e2e"

- python-version: 3.9
PYLINTOPTS: "--rcfile=ci/3.6.pylintrc"
additional-nose-opts: "--with-coverage --cover-erase --cover-inclusive --cover-branches --cover-package=azurelinuxagent"

name: "Python ${{ matrix.python-version }} Unit Tests"
runs-on: ubuntu-18.04
runs-on: ubuntu-20.04

env:
PYLINTOPTS: ${{ matrix.PYLINTOPTS }}
PYLINTFILES: "azurelinuxagent setup.py makepkg.py tests"
PYLINTFILES: "azurelinuxagent setup.py makepkg.py tests tests_e2e"
NOSEOPTS: "--with-timer ${{ matrix.additional-nose-opts }}"
PYTHON_VERSION: ${{ matrix.python-version }}

steps:

- name: Checkout WALinuxAgent repo
uses: actions/checkout@v2
uses: actions/checkout@v3

- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
id: install-dependencies
run: |
Expand Down Expand Up @@ -106,6 +123,6 @@ jobs:
- name: Upload Coverage
if: matrix.python-version == 3.9
uses: codecov/codecov-action@v1
uses: codecov/codecov-action@v2
with:
file: ./coverage.xml
2 changes: 0 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ develop-eggs/
dist/
downloads/
eggs/
lib/
lib64/
parts/
sdist/
var/
Expand Down
2 changes: 1 addition & 1 deletion CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@
#
# Linux Agent team
#
* @narrieta @ZhidongPeng @nagworld9
* @narrieta @ZhidongPeng @nagworld9 @maddieford
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ The agent will use an HTTP proxy if provided via the `http_proxy` (for `http` re
`https_proxy` (for `https` requests) environment variables. The `HttpProxy.Host` and
`HttpProxy.Port` configuration variables (see below), if used, will override the environment
settings. Due to limitations of Python, the agent *does not* support HTTP proxies requiring
authentication.
authentication. Note that when the agent service is managed by systemd, environment variables
such as `http_proxy` and `https_proxy` should be defined using one of the mechanisms provided by
systemd (e.g. by using Environment or EnvironmentFile in the service file).

## Requirements

Expand Down
3 changes: 1 addition & 2 deletions azurelinuxagent/common/cgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,8 +360,7 @@ def try_swap_memory_usage(self):
except CounterNotFound as e:
if self._counter_not_found_error_count < 1:
logger.periodic_info(logger.EVERY_HALF_HOUR,
'Could not find swap counter from "memory.stat" file in the cgroup: {0}.'
' Internal error: {1}'.format(self.path, ustr(e)))
'{0} from "memory.stat" file in the cgroup: {1}---[Note: This log for informational purpose only and can be ignored]'.format(ustr(e), self.path))
self._counter_not_found_error_count += 1
return 0

Expand Down
7 changes: 6 additions & 1 deletion azurelinuxagent/common/cgroupapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,12 @@ def _is_systemd_failure(scope_name, stderr):
return unit_not_found in stderr or scope_name not in stderr

@staticmethod
def get_extension_slice_name(extension_name):
def get_extension_slice_name(extension_name, old_slice=False):
# The old slice makes it difficult for user to override the limits because they need to place drop-in files on every upgrade if extension slice is different for each version.
# old slice includes <HandlerName>.<ExtensionName>-<HandlerVersion>
# new slice without version <HandlerName>.<ExtensionName>
if not old_slice:
extension_name = extension_name.rsplit("-", 1)[0]
# Since '-' is used as a separator in systemd unit names, we replace it with '_' to prevent side-effects.
return EXTENSION_SLICE_PREFIX + "-" + extension_name.replace('-', '_') + ".slice"

Expand Down
70 changes: 62 additions & 8 deletions azurelinuxagent/common/cgroupconfigurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from azurelinuxagent.common.cgroup import CpuCgroup, AGENT_NAME_TELEMETRY, MetricsCounter, MemoryCgroup
from azurelinuxagent.common.cgroupapi import CGroupsApi, SystemdCgroupsApi, SystemdRunError, EXTENSION_SLICE_PREFIX
from azurelinuxagent.common.cgroupstelemetry import CGroupsTelemetry
from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException
from azurelinuxagent.common.exception import ExtensionErrorCodes, CGroupsException, AgentMemoryExceededException
from azurelinuxagent.common.future import ustr
from azurelinuxagent.common.osutil import get_osutil, systemd
from azurelinuxagent.common.version import get_distro
Expand Down Expand Up @@ -143,6 +143,7 @@ def __init__(self):
self._cgroups_api = None
self._agent_cpu_cgroup_path = None
self._agent_memory_cgroup_path = None
self._agent_memory_cgroup = None
self._check_cgroups_lock = threading.RLock() # Protect the check_cgroups which is called from Monitor thread and main loop.

def initialize(self):
Expand Down Expand Up @@ -213,7 +214,8 @@ def initialize(self):

if self._agent_memory_cgroup_path is not None:
_log_cgroup_info("Agent Memory cgroup: {0}", self._agent_memory_cgroup_path)
CGroupsTelemetry.track_cgroup(MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path))
self._agent_memory_cgroup = MemoryCgroup(AGENT_NAME_TELEMETRY, self._agent_memory_cgroup_path)
CGroupsTelemetry.track_cgroup(self._agent_memory_cgroup)

_log_cgroup_info('Agent cgroups enabled: {0}', self._agent_cgroups_enabled)

Expand Down Expand Up @@ -366,10 +368,9 @@ def __setup_azure_slice():
if not os.path.exists(vmextensions_slice):
files_to_create.append((vmextensions_slice, _VMEXTENSIONS_SLICE_CONTENTS))

if not os.path.exists(logcollector_slice):
slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA)

files_to_create.append((logcollector_slice, slice_contents))
# Update log collector slice contents
slice_contents = _LOGCOLLECTOR_SLICE_CONTENTS_FMT.format(cpu_quota=_LOGCOLLECTOR_CPU_QUOTA)
files_to_create.append((logcollector_slice, slice_contents))

if fileutil.findre_in_file(agent_unit_file, r"Slice=") is not None:
CGroupConfigurator._Impl.__cleanup_unit_file(agent_drop_in_file_slice)
Expand Down Expand Up @@ -454,6 +455,11 @@ def __create_all_files(files_to_create):

def is_extension_resource_limits_setup_completed(self, extension_name, cpu_quota=None):
unit_file_install_path = systemd.get_unit_file_install_path()
old_extension_slice_path = os.path.join(unit_file_install_path, SystemdCgroupsApi.get_extension_slice_name(extension_name, old_slice=True))
# clean up the old slice from the disk
if os.path.exists(old_extension_slice_path):
CGroupConfigurator._Impl.__cleanup_unit_file(old_extension_slice_path)

extension_slice_path = os.path.join(unit_file_install_path,
SystemdCgroupsApi.get_extension_slice_name(extension_name))
cpu_quota = str(
Expand Down Expand Up @@ -644,7 +650,7 @@ def _check_processes_in_agent_cgroup(self):
Raises a CGroupsException if the check fails
"""
unexpected = []

agent_cgroup_proc_names = []
try:
daemon = os.getppid()
extension_handler = os.getpid()
Expand All @@ -658,9 +664,13 @@ def _check_processes_in_agent_cgroup(self):
systemd_run_commands.update(self._cgroups_api.get_systemd_run_commands())

for process in agent_cgroup:
agent_cgroup_proc_names.append(self.__format_process(process))
# Note that the agent uses systemd-run to start extensions; systemd-run belongs to the agent cgroup, though the extensions don't.
if process in (daemon, extension_handler) or process in systemd_run_commands:
continue
# check shell systemd_run process if above process check didn't catch it
if self._check_systemd_run_process(process):
continue
# systemd_run_commands contains the shell that started systemd-run, so we also need to check for the parent
if self._get_parent(process) in systemd_run_commands and self._get_command(
process) == 'systemd-run':
Expand All @@ -679,6 +689,7 @@ def _check_processes_in_agent_cgroup(self):
_log_cgroup_warning("Error checking the processes in the agent's cgroup: {0}".format(ustr(exception)))

if len(unexpected) > 0:
self._report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected)
raise CGroupsException("The agent's cgroup includes unexpected processes: {0}".format(unexpected))

@staticmethod
Expand Down Expand Up @@ -741,13 +752,53 @@ def __is_zombie_process(pid):
pass
return False

@staticmethod
def _check_systemd_run_process(process):
"""
Returns True if process is shell systemd-run process started by agent otherwise False.
Ex: sh,7345 -c systemd-run --unit=enable_7c5cab19-eb79-4661-95d9-9e5091bd5ae0 --scope --slice=azure-vmextensions-Microsoft.OSTCExtensions.VMAccessForLinux_1.5.11.slice /var/lib/waagent/Microsoft.OSTCExtensions.VMAccessForLinux-1.5.11/processes.sh
"""
try:
process_name = "UNKNOWN"
cmdline = '/proc/{0}/cmdline'.format(process)
if os.path.exists(cmdline):
with open(cmdline, "r") as cmdline_file:
process_name = "{0}".format(cmdline_file.read())
match = re.search(r'systemd-run.*--unit=.*--scope.*--slice=azure-vmextensions.*', process_name)
if match is not None:
return True
except Exception:
pass
return False

@staticmethod
def _report_agent_cgroups_procs(agent_cgroup_proc_names, unexpected):
for proc_name in unexpected:
if 'UNKNOWN' in proc_name:
msg = "Agent includes following processes when UNKNOWN process found: {0}".format("\n".join([ustr(proc) for proc in agent_cgroup_proc_names]))
add_event(op=WALAEventOperation.CGroupsInfo, message=msg)

@staticmethod
def _check_agent_throttled_time(cgroup_metrics):
for metric in cgroup_metrics:
if metric.instance == AGENT_NAME_TELEMETRY and metric.counter == MetricsCounter.THROTTLED_TIME:
if metric.value > conf.get_agent_cpu_throttled_time_threshold():
raise CGroupsException("The agent has been throttled for {0} seconds".format(metric.value))

def check_agent_memory_usage(self):
    """
    Compares the agent's tracked memory usage against conf.get_agent_memory_quota().

    The usage is the sum of the TOTAL_MEM_USAGE and SWAP_MEM_USAGE metrics reported by the agent's memory
    cgroup. Raises AgentMemoryExceededException when that sum is greater than the quota. Nothing is checked
    when cgroups are not enabled or the agent's memory cgroup has not been set.
    """
    if not (self.enabled() and self._agent_memory_cgroup):
        return

    # Only the total and swap counters contribute to the usage; every other metric is skipped.
    current_usage = sum(
        sample.value
        for sample in self._agent_memory_cgroup.get_tracked_metrics()
        if sample.counter == MetricsCounter.TOTAL_MEM_USAGE or sample.counter == MetricsCounter.SWAP_MEM_USAGE
    )

    if current_usage > conf.get_agent_memory_quota():
        raise AgentMemoryExceededException("The agent memory limit {0} bytes exceeded. The current reported usage is {1} bytes.".format(conf.get_agent_memory_quota(), current_usage))

@staticmethod
def _get_parent(pid):
"""
Expand Down Expand Up @@ -875,7 +926,10 @@ def setup_extension_slice(self, extension_name, cpu_quota):
SystemdCgroupsApi.get_extension_slice_name(extension_name))
try:
cpu_quota = str(cpu_quota) + "%" if cpu_quota is not None else "" # setting an empty value resets to the default (infinity)
_log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", extension_name, cpu_quota)
if cpu_quota == "":
_log_cgroup_info("CPUQuota not set for {0}", extension_name)
else:
_log_cgroup_info("Ensuring the {0}'s CPUQuota is {1}", extension_name, cpu_quota)
slice_contents = _EXTENSION_SLICE_CONTENTS.format(extension_name=extension_name,
cpu_quota=cpu_quota)
CGroupConfigurator._Impl.__create_unit_file(extension_slice_path, slice_contents)
Expand Down
20 changes: 20 additions & 0 deletions azurelinuxagent/common/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
"Debug.CgroupLogMetrics": False,
"Debug.CgroupDisableOnProcessCheckFailure": True,
"Debug.CgroupDisableOnQuotaCheckFailure": True,
"Debug.EnableAgentMemoryUsageCheck": False,
"Debug.EnableFastTrack": True,
"Debug.EnableGAVersioning": False
}
Expand Down Expand Up @@ -186,6 +187,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__):
"Debug.CgroupCheckPeriod": 300,
"Debug.AgentCpuQuota": 50,
"Debug.AgentCpuThrottledTimeThreshold": 120,
"Debug.AgentMemoryQuota": 30 * 1024 ** 2,
"Debug.EtpCollectionPeriod": 300,
"Debug.AutoUpdateHotfixFrequency": 14400,
"Debug.AutoUpdateNormalFrequency": 86400,
Expand Down Expand Up @@ -555,6 +557,24 @@ def get_agent_cpu_throttled_time_threshold(conf=__conf__):
return conf.get_int("Debug.AgentCpuThrottledTimeThreshold", 120)


def get_agent_memory_quota(conf=__conf__):
    """
    Returns the value of "Debug.AgentMemoryQuota": the memory quota for the agent, in bytes.
    Falls back to 30 * 1024 ** 2 bytes when the setting is not present.

    NOTE: This option is experimental and may be removed in later versions of the Agent.
    """
    default_quota = 30 * 1024 ** 2
    return conf.get_int("Debug.AgentMemoryQuota", default_quota)


def get_enable_agent_memory_usage_check(conf=__conf__):
    """
    Returns the "Debug.EnableAgentMemoryUsageCheck" switch; when True, the agent checks its own memory usage.
    Falls back to False when the setting is not present.

    NOTE: This option is experimental and may be removed in later versions of the Agent.
    """
    enabled_by_default = False
    return conf.get_switch("Debug.EnableAgentMemoryUsageCheck", enabled_by_default)


def get_cgroup_monitor_expiry_time(conf=__conf__):
"""
cgroups monitoring for pilot extensions disabled after expiry time
Expand Down
1 change: 1 addition & 0 deletions azurelinuxagent/common/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ class WALAEventOperation:
ActivateResourceDisk = "ActivateResourceDisk"
AgentBlacklisted = "AgentBlacklisted"
AgentEnabled = "AgentEnabled"
AgentMemory = "AgentMemory"
AgentUpgrade = "AgentUpgrade"
ArtifactsProfileBlob = "ArtifactsProfileBlob"
CGroupsCleanUp = "CGroupsCleanUp"
Expand Down
8 changes: 8 additions & 0 deletions azurelinuxagent/common/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,14 @@ def __init__(self, msg=None, inner=None):
super(AgentConfigError, self).__init__(msg, inner)


class AgentMemoryExceededException(AgentError):
    """
    Raised when the agent's memory limit has been reached.
    """
    def __init__(self, msg=None, inner=None):
        # Explicit two-argument super() keeps the class usable on Python 2 as well as Python 3.
        parent = super(AgentMemoryExceededException, self)
        parent.__init__(msg, inner)


class AgentNetworkError(AgentError):
"""
When network is not available.
Expand Down
Loading

0 comments on commit 28345a5

Please sign in to comment.