Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding use_oneshot parameter to speed up check runs with psutil.Process().oneshot() #17817

Merged
merged 8 commits into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions process/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,11 @@ files:
value:
type: integer
example: 120
- name: use_oneshot
description: |
If set to `true`, the check uses `psutil.Process().oneshot()` to collect and cache process metrics.
This can help speed up the check completion.
value:
type: boolean
example: true
- template: instances/default
1 change: 1 addition & 0 deletions process/changelog.d/17817.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add `use_oneshot` parameter to speed up check runs with `psutil.Process().oneshot()`
clarkb7 marked this conversation as resolved.
Show resolved Hide resolved
4 changes: 4 additions & 0 deletions process/datadog_checks/process/config_models/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,7 @@ def instance_pid_cache_duration():

def instance_try_sudo():
    # Default for the `try_sudo` instance option: do not attempt sudo.
    return False


def instance_use_oneshot():
    # Default for the `use_oneshot` instance option: collect process metrics
    # via psutil.Process().oneshot() caching (can speed up check runs).
    return True
1 change: 1 addition & 0 deletions process/datadog_checks/process/config_models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ class InstanceConfig(BaseModel):
tags: Optional[tuple[str, ...]] = None
thresholds: Optional[MappingProxyType[str, Any]] = None
try_sudo: Optional[bool] = None
use_oneshot: Optional[bool] = None
user: Optional[str] = None

@model_validator(mode='before')
Expand Down
6 changes: 6 additions & 0 deletions process/datadog_checks/process/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,12 @@ instances:
#
# pid_cache_duration: 120

## @param use_oneshot - boolean - optional - default: true
## If set to `true`, the check uses `psutil.Process().oneshot()` to collect and cache process metrics.
## This can help speed up the check completion.
#
# use_oneshot: true

## @param tags - list of strings - optional
## A list of tags to attach to every metric and service check emitted by this instance.
##
Expand Down
123 changes: 67 additions & 56 deletions process/datadog_checks/process/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(self, name, init_config, instances):
self.collect_children = is_affirmative(self.instance.get('collect_children', False))
self.user = self.instance.get('user', False)
self.try_sudo = self.instance.get('try_sudo', False)
self.use_oneshot = is_affirmative(self.instance.get('use_oneshot', True))

# ad stands for access denied
# We cache the PIDs getting this error and don't iterate on them more often than `access_denied_cache_duration``
Expand Down Expand Up @@ -293,65 +294,75 @@ def get_process_state(self, name, pids):

p = self.process_cache[name][pid]

meminfo = self.psutil_wrapper(p, 'memory_info', ['rss', 'vms'])
st['rss'].append(meminfo.get('rss'))
st['vms'].append(meminfo.get('vms'))
if self.use_oneshot:
self.log.debug("Using psutil Process.oneshot()")
with p.oneshot():
st = self.run_psutil_methods(pid, p, st, new_process)
else:
st = self.run_psutil_methods(pid, p, st, new_process)

mem_percent = self.psutil_wrapper(p, 'memory_percent')
st['mem_pct'].append(mem_percent)
return st

# will fail on win32 and solaris
shared_mem = self.psutil_wrapper(p, 'memory_info', ['shared']).get('shared')
if shared_mem is not None and meminfo.get('rss') is not None:
st['real'].append(meminfo['rss'] - shared_mem)
else:
st['real'].append(None)

ctxinfo = self.psutil_wrapper(p, 'num_ctx_switches', ['voluntary', 'involuntary'])
st['ctx_swtch_vol'].append(ctxinfo.get('voluntary'))
st['ctx_swtch_invol'].append(ctxinfo.get('involuntary'))

st['thr'].append(self.psutil_wrapper(p, 'num_threads'))

cpu_percent = self.psutil_wrapper(p, 'cpu_percent')
cpu_count = psutil.cpu_count()
if not new_process:
# psutil returns `0.` for `cpu_percent` the
# first time it's sampled on a process,
# so save the value only on non-new processes
st['cpu'].append(cpu_percent)
if cpu_count > 0 and cpu_percent is not None:
st['cpu_norm'].append(cpu_percent / cpu_count)
else:
self.log.debug('could not calculate the normalized cpu pct, cpu_count: %s', cpu_count)
st['open_fd'].append(self.psutil_wrapper(p, 'num_fds'))
st['open_handle'].append(self.psutil_wrapper(p, 'num_handles'))

ioinfo = self.psutil_wrapper(p, 'io_counters', ['read_count', 'write_count', 'read_bytes', 'write_bytes'])
st['r_count'].append(ioinfo.get('read_count'))
st['w_count'].append(ioinfo.get('write_count'))
st['r_bytes'].append(ioinfo.get('read_bytes'))
st['w_bytes'].append(ioinfo.get('write_bytes'))

pagefault_stats = self.get_pagefault_stats(pid)
if pagefault_stats is not None:
(minflt, cminflt, majflt, cmajflt) = pagefault_stats
st['minflt'].append(minflt)
st['cminflt'].append(cminflt)
st['majflt'].append(majflt)
st['cmajflt'].append(cmajflt)
def run_psutil_methods(self, pid, p, st, new_process):
    """Collect psutil metrics for one process and append them to the stats dict.

    Args:
        pid: PID of the process (used for pagefault stats lookup).
        p: the psutil.Process handle (possibly inside a `oneshot()` context).
        st: dict of metric-name -> list accumulators to append to.
        new_process: True if this PID was first seen on this run; cpu_percent
            is skipped for new processes because psutil reports `0.` on the
            first sample.

    Returns:
        The same `st` dict, with one value (or None) appended per metric.
    """
    meminfo = self.psutil_wrapper(p, 'memory_info', ['rss', 'vms'])
    st['rss'].append(meminfo.get('rss'))
    st['vms'].append(meminfo.get('vms'))

    mem_percent = self.psutil_wrapper(p, 'memory_percent')
    st['mem_pct'].append(mem_percent)

    # will fail on win32 and solaris
    shared_mem = self.psutil_wrapper(p, 'memory_info', ['shared']).get('shared')
    if shared_mem is not None and meminfo.get('rss') is not None:
        st['real'].append(meminfo['rss'] - shared_mem)
    else:
        st['real'].append(None)

    ctxinfo = self.psutil_wrapper(p, 'num_ctx_switches', ['voluntary', 'involuntary'])
    st['ctx_swtch_vol'].append(ctxinfo.get('voluntary'))
    st['ctx_swtch_invol'].append(ctxinfo.get('involuntary'))

    st['thr'].append(self.psutil_wrapper(p, 'num_threads'))

    cpu_percent = self.psutil_wrapper(p, 'cpu_percent')
    cpu_count = psutil.cpu_count()
    if not new_process:
        # psutil returns `0.` for `cpu_percent` the
        # first time it's sampled on a process,
        # so save the value only on non-new processes
        st['cpu'].append(cpu_percent)
        if cpu_count > 0 and cpu_percent is not None:
            st['cpu_norm'].append(cpu_percent / cpu_count)
        else:
            self.log.debug('could not calculate the normalized cpu pct, cpu_count: %s', cpu_count)
    st['open_fd'].append(self.psutil_wrapper(p, 'num_fds'))
    st['open_handle'].append(self.psutil_wrapper(p, 'num_handles'))

    ioinfo = self.psutil_wrapper(p, 'io_counters', ['read_count', 'write_count', 'read_bytes', 'write_bytes'])
    st['r_count'].append(ioinfo.get('read_count'))
    st['w_count'].append(ioinfo.get('write_count'))
    st['r_bytes'].append(ioinfo.get('read_bytes'))
    st['w_bytes'].append(ioinfo.get('write_bytes'))

    pagefault_stats = self.get_pagefault_stats(pid)
    if pagefault_stats is not None:
        (minflt, cminflt, majflt, cmajflt) = pagefault_stats
        st['minflt'].append(minflt)
        st['cminflt'].append(cminflt)
        st['majflt'].append(majflt)
        st['cmajflt'].append(cmajflt)
    else:
        # Pagefault stats unavailable (e.g. non-Linux); keep list lengths aligned.
        st['minflt'].append(None)
        st['cminflt'].append(None)
        st['majflt'].append(None)
        st['cmajflt'].append(None)

    # calculate process run time
    create_time = self.psutil_wrapper(p, 'create_time')
    if create_time is not None:
        now = time.time()
        run_time = now - create_time
        st['run_time'].append(run_time)

    return st

Expand Down
2 changes: 2 additions & 0 deletions process/hatch.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@

[[envs.default.matrix]]
python = ["2.7", "3.11"]

[envs.bench]
36 changes: 36 additions & 0 deletions process/tests/test_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# (C) Datadog, Inc. 2018-present
mrafi97 marked this conversation as resolved.
Show resolved Hide resolved
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
from datadog_checks.process import ProcessCheck

from . import common


def test_run(benchmark, dd_run_check):
instance = {
'name': 'py',
'search_string': ['python'],
'exact_match': False,
'ignored_denied_access': True,
'use_oneshot': False,
'thresholds': {'warning': [1, 10], 'critical': [1, 100]},
}
process = ProcessCheck(common.CHECK_NAME, {}, [instance])
dd_run_check(process)

benchmark(dd_run_check, process)


def test_run_oneshot(benchmark, dd_run_check):
instance = {
'name': 'py',
'search_string': ['python'],
'exact_match': False,
'ignored_denied_access': True,
'use_oneshot': True,
'thresholds': {'warning': [1, 10], 'critical': [1, 100]},
}
process = ProcessCheck(common.CHECK_NAME, {}, [instance])
dd_run_check(process)

benchmark(dd_run_check, process)
15 changes: 14 additions & 1 deletion process/tests/test_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ def is_running(self):
def children(self, recursive=False):
return []

# https://stackoverflow.com/questions/5093382/object-becomes-none-when-using-a-context-manager
def oneshot(self):
    """Stub of psutil.Process.oneshot(): return a no-op context manager."""

    class MockOneShot(object):
        def __enter__(self):
            return self

        # NOTE: renamed params from (type, value, traceback) so the builtin
        # `type` is not shadowed; a no-op __exit__ never suppresses exceptions.
        def __exit__(self, exc_type, exc_val, exc_tb):
            pass

    return MockOneShot()


class NamedMockProcess(object):
def __init__(self, name):
Expand Down Expand Up @@ -227,7 +238,8 @@ def test_check_missing_process(aggregator, dd_run_check, caplog):
assert "Unable to find process named ['fooprocess', '/usr/bin/foo'] among processes" in caplog.text


def test_check_real_process(aggregator, dd_run_check):
@pytest.mark.parametrize("oneshot", [True, False])
def test_check_real_process(aggregator, dd_run_check, oneshot):
"Check that we detect python running (at least this process)"
from datadog_checks.base.utils.platform import Platform

Expand All @@ -237,6 +249,7 @@ def test_check_real_process(aggregator, dd_run_check):
'exact_match': False,
'ignored_denied_access': True,
'thresholds': {'warning': [1, 10], 'critical': [1, 100]},
'use_oneshot': oneshot,
}
process = ProcessCheck(common.CHECK_NAME, {}, [instance])
expected_tags = generate_expected_tags(instance)
Expand Down
Loading