Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

don't report CPU metrics if it's negative value #2538

Merged
merged 3 commits into from
Mar 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions azurelinuxagent/common/cgroup.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,9 @@ def is_active(self):

def get_tracked_metrics(self, **_):
"""
Retrieves the current value of the metrics tracked for this cgroup and returns them as an array
Retrieves the current value of the metrics tracked for this cgroup and returns them as an array.

Note: The agent does not report the metrics, and returns an empty array, if the current CPU ticks are less than the previous value.
"""
raise NotImplementedError()

Expand Down Expand Up @@ -241,11 +243,16 @@ def get_throttled_time(self):
return float(self._current_throttled_time - self._previous_throttled_time) / 1E9

def get_tracked_metrics(self, **kwargs):
tracked = [
MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, self.get_cpu_usage()),
]
tracked = []
cpu_usage = self.get_cpu_usage()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updating the previous/current values happens inside these get functions on every poll, so no explicit logic is needed to reset the values.

if cpu_usage >= float(0):
tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.PROCESSOR_PERCENT_TIME, self.name, cpu_usage))

if 'track_throttled_time' in kwargs and kwargs['track_throttled_time']:
tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, self.get_throttled_time()))
throttled_time = self.get_throttled_time()
if cpu_usage >= float(0) and throttled_time >= float(0):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case, the throttled time very likely has a wrong value too, so I decided not to report either metric.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we do not need to check for the 2 conditions: if the single condition cpu_usage < 0 is true, we should not report either of the 2 metrics.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, if cpu_usage < 0 is true then it won't check the second condition. I added the second condition for the rare case where only the throttled time is negative.

tracked.append(MetricValue(MetricsCategory.CPU_CATEGORY, MetricsCounter.THROTTLED_TIME, self.name, throttled_time))

return tracked


Expand Down
35 changes: 35 additions & 0 deletions tests/common/test_cgroupstelemetry.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,3 +374,38 @@ def test_extension_telemetry_not_sent_for_empty_perf_metrics(self, *args): # py
metrics = CGroupsTelemetry.poll_all_tracked()
self.assertEqual(0, len(metrics))

@patch("azurelinuxagent.common.cgroup.CpuCgroup.get_cpu_usage")
@patch("azurelinuxagent.common.cgroup.CpuCgroup.get_throttled_time")
@patch("azurelinuxagent.common.cgroup.CGroup.is_active")
def test_cgroup_telemetry_should_not_report_cpu_negative_value(self, patch_is_active, patch_get_throttled_time,
                                                               patch_get_cpu_usage):
    """
    Poll a tracked CPU cgroup several times and verify that a poll in which the (mocked) CPU usage
    is negative produces no metrics at all, while every other poll produces both CPU metrics.

    The mock arguments are injected positionally by the @patch decorators, bottom decorator first.
    """
    num_polls = 5
    num_extensions = 1

    # Only the reporting logic is verified here, not the validity of the values themselves.
    # Every poll sees a non-negative CPU usage except the last one, which sees -1.
    cpu_percent_values = [random.randint(0, 100) for _ in range(num_polls - 1)]
    cpu_percent_values.append(-1)
    cpu_throttled_values = [random.randint(0, 60 * 60) for _ in range(num_polls)]

    dummy_cpu_cgroup = CpuCgroup("dummy_extension_name", "dummy_cpu_path")
    CGroupsTelemetry.track_cgroup(dummy_cpu_cgroup)
    self.assertEqual(1, len(CGroupsTelemetry._tracked))

    # These settings are the same for every poll, so they are applied once up front.
    patch_is_active.return_value = True
    CGroupsTelemetry._track_throttled_time = True

    for cpu_percent, throttled_time in zip(cpu_percent_values, cpu_throttled_values):
        patch_get_cpu_usage.return_value = cpu_percent
        patch_get_throttled_time.return_value = throttled_time

        metrics = CGroupsTelemetry.poll_all_tracked()

        if cpu_percent >= 0:
            # 1 CPU metric + 1 CPU throttled
            self.assertEqual(len(metrics), 2 * num_extensions)
        else:
            # ignore CPU metrics from telemetry if cpu cgroup reports negative value
            self.assertEqual(len(metrics), 0)

        # Whatever was reported for this poll must never be negative.
        for metric in metrics:
            self.assertGreaterEqual(metric.value, 0, "telemetry should not report negative value")