-
Notifications
You must be signed in to change notification settings - Fork 372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
enforce memory usage for agent #2671
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -136,6 +136,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__): | |
"Debug.CgroupLogMetrics": False, | ||
"Debug.CgroupDisableOnProcessCheckFailure": True, | ||
"Debug.CgroupDisableOnQuotaCheckFailure": True, | ||
"Debug.EnableAgentMemoryUsageCheck": False, | ||
"Debug.EnableFastTrack": True, | ||
"Debug.EnableGAVersioning": False | ||
} | ||
|
@@ -186,6 +187,7 @@ def load_conf_from_file(conf_file_path, conf=__conf__): | |
"Debug.CgroupCheckPeriod": 300, | ||
"Debug.AgentCpuQuota": 50, | ||
"Debug.AgentCpuThrottledTimeThreshold": 120, | ||
"Debug.AgentMemoryQuota": 31457280, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we use the same notation as below? (30 * 1024 ** 2) |
||
"Debug.EtpCollectionPeriod": 300, | ||
"Debug.AutoUpdateHotfixFrequency": 14400, | ||
"Debug.AutoUpdateNormalFrequency": 86400, | ||
|
@@ -555,6 +557,24 @@ def get_agent_cpu_throttled_time_threshold(conf=__conf__): | |
return conf.get_int("Debug.AgentCpuThrottledTimeThreshold", 120) | ||
|
||
|
||
def get_agent_memory_quota(conf=__conf__): | ||
""" | ||
Memory quota for the agent in bytes. | ||
|
||
NOTE: This option is experimental and may be removed in later versions of the Agent. | ||
""" | ||
return conf.get_int("Debug.AgentMemoryQuota", 30 * 1024 ** 2) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is just a placeholder value. We will update once we have concrete value. |
||
|
||
|
||
def get_enable_agent_memory_usage_check(conf=__conf__): | ||
""" | ||
If True, the Agent checks its memory usage. | ||
|
||
NOTE: This option is experimental and may be removed in later versions of the Agent. | ||
""" | ||
return conf.get_switch("Debug.EnableAgentMemoryUsageCheck", False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. feature set to false for now |
||
|
||
|
||
def get_cgroup_monitor_expiry_time(conf=__conf__): | ||
""" | ||
cgroups monitoring for pilot extensions is disabled after the expiry time | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,7 +39,8 @@ | |
from azurelinuxagent.common.cgroupconfigurator import CGroupConfigurator | ||
from azurelinuxagent.common.event import add_event, initialize_event_logger_vminfo_common_parameters, \ | ||
WALAEventOperation, EVENTS_DIRECTORY | ||
from azurelinuxagent.common.exception import ResourceGoneError, UpdateError, ExitException, AgentUpgradeExitException | ||
from azurelinuxagent.common.exception import ResourceGoneError, UpdateError, ExitException, AgentUpgradeExitException, \ | ||
CGroupsException | ||
from azurelinuxagent.common.future import ustr | ||
from azurelinuxagent.common.osutil import get_osutil, systemd | ||
from azurelinuxagent.common.persist_firewall_rules import PersistFirewallRulesHandler | ||
|
@@ -137,6 +138,7 @@ def get_update_handler(): | |
|
||
class UpdateHandler(object): | ||
TELEMETRY_HEARTBEAT_PERIOD = timedelta(minutes=30) | ||
CHECK_MEMORY_USAGE_PERIOD = timedelta(seconds=conf.get_cgroup_check_period()) | ||
|
||
def __init__(self): | ||
self.osutil = get_osutil() | ||
|
@@ -162,6 +164,9 @@ def __init__(self): | |
self._heartbeat_id = str(uuid.uuid4()).upper() | ||
self._heartbeat_counter = 0 | ||
|
||
self._last_check_memory_usage = datetime.min | ||
self._check_memory_usage_last_error_report = datetime.min | ||
|
||
# VM Size is reported via the heartbeat, default it here. | ||
self._vm_size = None | ||
|
||
|
@@ -401,6 +406,7 @@ def run(self, debug=False): | |
self._check_threads_running(all_thread_handlers) | ||
self._process_goal_state(exthandlers_handler, remote_access_handler) | ||
self._send_heartbeat_telemetry(protocol) | ||
self._check_agent_memory_usage() | ||
time.sleep(self._goal_state_period) | ||
|
||
except AgentUpgradeExitException as exitException: | ||
|
@@ -1288,6 +1294,27 @@ def _send_heartbeat_telemetry(self, protocol): | |
self._heartbeat_update_goal_state_error_count = 0 | ||
self._last_telemetry_heartbeat = datetime.utcnow() | ||
|
||
def _check_agent_memory_usage(self): | ||
""" | ||
This checks the agent's current memory usage and safely exits the process if the agent reaches the memory limit | ||
""" | ||
try: | ||
if conf.get_enable_agent_memory_usage_check() and self._extensions_summary.converged: | ||
if self._last_check_memory_usage == datetime.min or datetime.utcnow() >= (self._last_check_memory_usage + UpdateHandler.CHECK_MEMORY_USAGE_PERIOD): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This checks whether the goal state has completed and also respects the cgroup monitor period. Note: I'm using the same cgroup monitor period to run this check, assuming that we evaluate and calculate the memory limit based on the values we get from the monitoring thread. |
||
self._last_check_memory_usage = datetime.utcnow() | ||
CGroupConfigurator.get_instance().check_agent_memory_usage() | ||
except CGroupsException as exception: | ||
msg = "Check on agent memory usage:\n{0}".format(ustr(exception)) | ||
logger.info(msg) | ||
add_event(AGENT_NAME, op=WALAEventOperation.CGroupsInfo, is_success=True, message=msg) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should have a new WALAEventOperation for this event |
||
raise ExitException("Agent {0} has reached its memory limit -- exiting".format(CURRENT_AGENT)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This ExitException and rest of the logic of sys.exit already handled in main thread for safe exit. |
||
except Exception as exception: | ||
if self._check_memory_usage_last_error_report == datetime.min or (self._check_memory_usage_last_error_report + timedelta(hours=6)) > datetime.now(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is to avoid flooding error msgs in agent log There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cool. Could also add a short string such as "[Won't report the same error for 6 hours]"? |
||
self._check_memory_usage_last_error_report = datetime.now() | ||
msg = "Error checking the agent's memory usage: {0}".format(ustr(exception)) | ||
logger.warn(msg) | ||
add_event(AGENT_NAME, op=WALAEventOperation.CGroupsInfo, is_success=False, message=msg) | ||
|
||
@staticmethod | ||
def _ensure_extension_telemetry_state_configured_properly(protocol): | ||
etp_enabled = get_supported_feature_by_name(SupportedFeatureNames.ExtensionTelemetryPipeline).is_supported | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's use a new exception type for this (AgentMemoryExceededException?). CGroupsException is too generic, and also in general the code handles cgroups issues in such a way that the corresponding cgroup task becomes a no-op.