fix bugs & refactor #31

Merged: 8 commits, Feb 23, 2023
13 changes: 0 additions & 13 deletions driver/bpf/maps.h
@@ -136,19 +136,6 @@ struct info_t {
u8 time_type[NUM];
};

#define TYPE_NUM 8
struct time_aggregate_t {
u32 pid;
u32 tid;
char comm[TASK_COMM_LEN]; // 16
u64 start_time;
// on_total_time, off_total_time;
u64 total_times[2];
// net, io, futex, idle, other
// 0, 1, 2, 3, 4
u64 time_specs[TYPE_NUM]; // expanded into individual parameters; 1 + 2 + 4
};

struct bpf_map_def __bpf_section("maps") on_start_ts = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(u32),
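
The per-thread maps declared in maps.h (on_start_ts and friends) are consumed by the probes later in this PR through the usual BPF hash-map helpers keyed by tid. As a minimal sketch of that store/lookup/delete pattern, assuming the on_start_ts map above (the demo_ helpers are illustrative and not part of the diff):

/* Illustrative only: store a start timestamp keyed by tid, then read it
 * back and compute the elapsed time when the same tid is seen again. */
static __always_inline void demo_record_start(u32 tid)
{
	u64 ts = bpf_ktime_get_ns();
	bpf_map_update_elem(&on_start_ts, &tid, &ts, BPF_ANY);
}

static __always_inline u64 demo_elapsed_ns(u32 tid)
{
	u64 *tsp = bpf_map_lookup_elem(&on_start_ts, &tid);
	u64 delta;

	if (tsp == 0)
		return 0;
	delta = bpf_ktime_get_ns() - *tsp;
	bpf_map_delete_elem(&on_start_ts, &tid);
	return delta;
}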
83 changes: 23 additions & 60 deletions driver/bpf/plumbing_helpers.h
@@ -29,7 +29,6 @@ or GPL2.txt for full copies of the license.
bpf_probe_read((void *)dst, size, (char *)ctx + __offset); \
} while (0);

//#define BPF_DEBUG
#ifdef BPF_DEBUG
#define bpf_printk(fmt, ...) \
do { \
@@ -50,15 +49,14 @@ static __always_inline bool prepare_filler(void *ctx,
enum ppm_event_type evt_type,
struct sysdig_bpf_settings *settings,
enum syscall_flags drop_flags);
static __always_inline int bpf_cpu_analysis(void *ctx, u32 tid);
#ifdef CPU_ANALYSIS
static __always_inline int bpf_cpu_analysis(void *ctx, u32 tid);
static __always_inline void clear_map(u32 tid)
{
bpf_map_delete_elem(&type_map, &tid);
bpf_map_delete_elem(&on_start_ts, &tid);
bpf_map_delete_elem(&off_start_ts, &tid);
bpf_map_delete_elem(&cpu_focus_threads, &tid);
// bpf_map_delete_elem(&aggregate_time, &tid);
bpf_map_delete_elem(&cpu_records, &tid);
}

@@ -131,23 +129,29 @@ static __always_inline enum offcpu_type get_syscall_type(int syscall_id) {
bpf_map_update_elem(&syscall_map, &syscall_id, &type, BPF_ANY);
return type;
}

static __always_inline void record_cpu_offtime(void *ctx, struct sysdig_bpf_settings *settings, u32 pid, u32 tid, u64 start_ts, u64 latency, u64 delta)
static __always_inline struct info_t* get_cpu_info(u32 pid, u32 tid, u64 real_start_ts)
{
uint16_t switch_agg_num = settings->switch_agg_num;
struct info_t *infop;
infop = bpf_map_lookup_elem(&cpu_records, &tid);
if (infop == 0) { // try init
if (infop == 0) {
// init
struct info_t info = {0};
info.pid = pid;
info.tid = tid;
info.start_ts = settings->boot_time + start_ts;
info.start_ts = real_start_ts;
info.index = 0;
bpf_map_update_elem(&cpu_records, &tid, &info, BPF_ANY);
infop = bpf_map_lookup_elem(&cpu_records, &tid);
}

return infop;
}

static __always_inline void record_cpu_offtime(void *ctx, struct sysdig_bpf_settings *settings, u32 pid, u32 tid, u64 start_ts, u64 latency, u64 delta)
{
uint16_t switch_agg_num = settings->switch_agg_num;
struct info_t *infop = get_cpu_info(pid, tid, settings->boot_time + start_ts);

if (infop != 0) {
if (infop->index < switch_agg_num) {
infop->times_specs[infop->index & (NUM - 1)] = delta;
@@ -165,48 +169,36 @@ static __always_inline void record_cpu_offtime(void *ctx, struct sysdig_bpf_sett
}
// update end_ts
infop->end_ts = settings->boot_time + bpf_ktime_get_ns();
// cache
bpf_map_update_elem(&cpu_records, &tid, infop, BPF_ANY);
}
}

static __always_inline void record_cpu_ontime_and_out(void *ctx, struct sysdig_bpf_settings *settings, u32 pid, u32 tid, u64 start_ts, u64 delta)
{
uint16_t switch_agg_num = settings->switch_agg_num;
struct info_t *infop;
infop = bpf_map_lookup_elem(&cpu_records, &tid);
if (infop == 0) { // try init
// init
struct info_t info = {0};
info.pid = pid;
info.tid = tid;
info.start_ts = settings->boot_time + start_ts;
info.index = 0;
bpf_map_update_elem(&cpu_records, &tid, &info, BPF_ANY);
infop = bpf_map_lookup_elem(&cpu_records, &tid);
}
struct info_t *infop = get_cpu_info(pid, tid, start_ts);

if (infop != 0) {
enum offcpu_type *typep, type;
// get the type of offcpu
typep = bpf_map_lookup_elem(&type_map, &tid);
if (infop->index < switch_agg_num) {
infop->times_specs[infop->index & (NUM - 1)] = delta;
infop->index++;
}
// update end_ts
infop->end_ts = settings->boot_time + bpf_ktime_get_ns();
u64 *focus_time = bpf_map_lookup_elem(&cpu_focus_threads, &tid);

int offset_ts = infop->end_ts - infop->start_ts;

u64 *focus_time = bpf_map_lookup_elem(&cpu_focus_threads, &tid);
bool have_focus_events = false;
if(focus_time){
u64 ftime = settings->boot_time + *focus_time;
if(ftime > start_ts && ftime < start_ts + delta) have_focus_events = true;
if (focus_time) {
if (*focus_time > start_ts && *focus_time < start_ts + delta) have_focus_events = true;
}

/* Some situations will trigger a perf out:
1. there are focused events, e.g. net events
2. the number of task switches reaches a specific count
3. the time span of the task switches (namely offset_ts) exceeds the threshold
*/
if (infop->index > 0 && (have_focus_events
|| infop->index == switch_agg_num || infop->index == switch_agg_num - 1 || offset_ts > 2000000000)) {
//bpf_printk("start_ts %llu", infop->start_ts);
// perf out
if (prepare_filler(ctx, ctx, PPME_CPU_ANALYSIS_E, settings, 0)) {
bpf_cpu_analysis(ctx, infop->tid);
@@ -218,35 +210,6 @@ static __always_inline void record_cpu_ontime_and_out(void *ctx, struct sysdig_b
memset(infop->times_specs, 0, sizeof(infop->times_specs));
memset(infop->rq, 0, sizeof(infop->rq));
}
// cache
bpf_map_update_elem(&cpu_records, &tid, infop, BPF_ANY);
}
}

static __always_inline void aggregate(u32 pid, u32 tid, u64 start_time, u64 current_interval, bool is_on)
{
struct time_aggregate_t* p_time = bpf_map_lookup_elem(&aggregate_time, &pid);
if (p_time == 0) {
struct time_aggregate_t time_aggregate = {};
time_aggregate.start_time = start_time;
bpf_map_update_elem(&aggregate_time, &pid, &time_aggregate, BPF_ANY);
p_time = bpf_map_lookup_elem(&aggregate_time, &pid);
}
if (p_time != 0) {
if (is_on) {
p_time->total_times[0] += current_interval;
} else {
enum offcpu_type *typep, type;
typep = bpf_map_lookup_elem(&type_map, &tid);
if (typep == 0) {
type = OTHER;
} else {
type = *typep;
}
p_time->total_times[1] += current_interval;
p_time->time_specs[((int)type - 1) & (TYPE_NUM - 1)] += current_interval;
}
bpf_map_update_elem(&aggregate_time, &pid, p_time, BPF_ANY);
}
}
#endif
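
Both record_cpu_offtime() and record_cpu_ontime_and_out() above write with infop->times_specs[infop->index & (NUM - 1)] while also checking index against switch_agg_num. A minimal user-space sketch of that pattern follows, assuming NUM is a power of two; DEMO_NUM and the demo_ names are illustrative, not part of the driver. The mask is what lets the BPF verifier prove the array access stays in bounds even though index is also bounded by a runtime limit.

/* Illustrative sketch (not from the diff): bounded aggregation with a
 * power-of-two mask, mirroring times_specs[index & (NUM - 1)]. */
#define DEMO_NUM 8 /* stands in for NUM; must be a power of two */

struct demo_record {
	unsigned long long times_specs[DEMO_NUM];
	unsigned short index;
};

static void demo_push(struct demo_record *r, unsigned short agg_limit,
		      unsigned long long delta)
{
	/* agg_limit plays the role of settings->switch_agg_num and is
	 * expected to be <= DEMO_NUM */
	if (r->index < agg_limit) {
		r->times_specs[r->index & (DEMO_NUM - 1)] = delta;
		r->index++;
	}
}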
110 changes: 18 additions & 92 deletions driver/bpf/probe.c
@@ -213,6 +213,14 @@ BPF_PROBE("sched/", sched_switch, sched_switch_args)
#ifdef BPF_SUPPORTS_RAW_TRACEPOINTS
BPF_PROBE("sched/", sched_switch, sched_switch_args)
{
struct task_struct *p = (struct task_struct *) ctx->prev;
struct task_struct *n = (struct task_struct *) ctx->next;
#else
BPF_KPROBE(finish_task_switch)
{
struct task_struct *p = (struct task_struct *) ctx->si;
struct task_struct *n = (struct task_struct *) bpf_get_current_task();
#endif
struct sysdig_bpf_settings *settings;
enum ppm_event_type evt_type = PPME_CPU_ANALYSIS_E;

@@ -223,134 +231,51 @@ BPF_PROBE("sched/", sched_switch, sched_switch_args)
if (!settings->capture_enabled)
return 0;

if (evt_type < PPM_EVENT_MAX && !settings->events_mask[evt_type]) {
if (evt_type < PPM_EVENT_MAX && !settings->events_mask[evt_type])
return 0;
}

struct task_struct *p = (struct task_struct *) ctx->prev;
struct task_struct *n = (struct task_struct *) ctx->next;
u32 tid = _READ(p->pid);
u32 pid = _READ(p->tgid);
u64 ts, *tsp;
if (FILTER) {
if (_READ(p->state) == TASK_RUNNING) {
u64 ts = bpf_ktime_get_ns();
bpf_map_update_elem(&cpu_runq, &pid, &ts, BPF_ANY);
}
// record previous thread (current) sleep time
// record previous thread offcpu start time
ts = bpf_ktime_get_ns();
bpf_map_update_elem(&off_start_ts, &tid, &ts, BPF_ANY);

// calculate oncpu time, sleep time - &on_start_ts
// p is the focus thread; it switches off
u64 *on_ts;
on_ts = bpf_map_lookup_elem(&on_start_ts, &tid);
if (on_ts != 0) {
// calculate previous thread's oncpu delta time
u64 delta = ts - *on_ts;
u64 delta_us = delta / 1000; // convert to us
bpf_map_delete_elem(&on_start_ts, &tid);
if ((delta_us >= MINBLOCK_US) && (delta_us <= MAXBLOCK_US)) {
if (check_filter(pid)) {
record_cpu_ontime_and_out(ctx, settings, pid, tid, *on_ts, delta);
// aggregate(pid, tid, *on_ts, delta, 1);
}
}
}
}
// get the next thread's start time
tid = _READ(n->pid);
pid = _READ(n->tgid);
if (!(FILTER))
return 0;

// record oncpu start time
u64 on_ts = bpf_ktime_get_ns();
// record on start time
bpf_map_update_elem(&on_start_ts, &tid, &on_ts, BPF_ANY);

tsp = bpf_map_lookup_elem(&off_start_ts, &tid);
if (tsp != 0) {
u64 off_ts = *tsp;
bpf_map_delete_elem(&off_start_ts, &tid);
// calculate current thread's off delta time
u64 delta = on_ts - off_ts;
u64 delta_us = delta / 1000;
if ((delta_us >= MINBLOCK_US) && (delta_us <= MAXBLOCK_US)) {
if (check_filter(pid)) {
u64 *rq_ts = bpf_map_lookup_elem(&cpu_runq, &tid);
u64 rq_la = 0;
if (rq_ts != 0) {
if (on_ts > *rq_ts)
rq_la = (on_ts - *rq_ts) / 1000;
bpf_map_delete_elem(&cpu_runq, &tid);
}
record_cpu_offtime(ctx, settings, pid, tid, off_ts, rq_la, delta);
// aggregate(pid, tid, off_ts, delta, 0);
}
}
}
return 0;
}
#else
BPF_KPROBE(finish_task_switch)
{
struct sysdig_bpf_settings *settings;
enum ppm_event_type evt_type;

settings = get_bpf_settings();
if (!settings)
return 0;

if (!settings->capture_enabled)
return 0;

struct task_struct *p = (struct task_struct *) ctx->si;
u32 tid = _READ(p->pid);
u32 pid = _READ(p->tgid);
u64 ts, *tsp;
if (FILTER) {
// record enqueue time
if (_READ(p->state) == TASK_RUNNING) {
u64 ts = bpf_ktime_get_ns();
bpf_map_update_elem(&cpu_runq, &pid, &ts, BPF_ANY);
}
// record previous thread (current) sleep time
ts = bpf_ktime_get_ns();
bpf_map_update_elem(&off_start_ts, &tid, &ts, BPF_ANY);

// calculate oncpu time, sleep time - &on_start_ts
// p is the focus thread; it switches off
u64 *on_ts;
on_ts = bpf_map_lookup_elem(&on_start_ts, &tid);
if (on_ts != 0) {
u64 delta = ts - *on_ts;
u64 delta_us = delta / 1000; // convert to us
bpf_map_delete_elem(&on_start_ts, &tid);
if ((delta_us >= MINBLOCK_US) && (delta_us <= MAXBLOCK_US)) {
if (check_filter(pid)) {
record_cpu_ontime_and_out(ctx, settings, pid, tid, *on_ts, delta);
// aggregate(pid, tid, *on_ts, delta, 1);
}
}
bpf_map_update_elem(&cpu_runq, &tid, &ts, BPF_ANY);
}
}
// get the next thread's start time
struct task_struct *n = (struct task_struct *)bpf_get_current_task();

tid = _READ(n->pid);
pid = _READ(n->tgid);
if (!(FILTER))
return 0;

// record oncpu start time
// record next thread's oncpu start time
u64 on_ts = bpf_ktime_get_ns();
// record on start time
bpf_map_update_elem(&on_start_ts, &tid, &on_ts, BPF_ANY);

tsp = bpf_map_lookup_elem(&off_start_ts, &tid);
if (tsp != 0) {
u64 off_ts = *tsp;
bpf_map_delete_elem(&off_start_ts, &tid);
// calculate current thread's off delta time
// calculate next thread's offcpu delta time
u64 delta = on_ts - off_ts;
u64 delta_us = delta / 1000;
if ((delta_us >= MINBLOCK_US) && (delta_us <= MAXBLOCK_US)) {
Expand All @@ -363,13 +288,13 @@ BPF_KPROBE(finish_task_switch)
bpf_map_delete_elem(&cpu_runq, &tid);
}
record_cpu_offtime(ctx, settings, pid, tid, off_ts, rq_la, delta);
// aggregate(pid, tid, off_ts, delta, 0);
}
}
}

return 0;
}
#endif

static __always_inline int bpf_trace_enqueue(struct sched_process_exit_args *ctx)
{
#ifdef BPF_SUPPORTS_RAW_TRACEPOINTS
@@ -383,6 +308,7 @@ static __always_inline int bpf_trace_enqueue(struct sched_process_exit_args *ctx
return 0;
u64 ts = bpf_ktime_get_ns();
bpf_map_update_elem(&cpu_runq, &pid, &ts, BPF_ANY);

return 0;
}
BPF_PROBE("sched/", sched_wakeup_new, sched_process_exit_args)
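
bpf_trace_enqueue() above stores an enqueue timestamp in cpu_runq at wakeup, and the switch-in path earlier reads it back and reports the difference as run-queue latency (rq_la, in microseconds). A minimal sketch of that pairing, assuming the same cpu_runq map; the demo_ helpers and the uniform tid key naming here are illustrative only:

/* Illustrative only: record enqueue time at wakeup, then compute
 * run-queue latency in microseconds at switch-in. */
static __always_inline void demo_on_wakeup(u32 tid)
{
	u64 ts = bpf_ktime_get_ns();
	bpf_map_update_elem(&cpu_runq, &tid, &ts, BPF_ANY);
}

static __always_inline u64 demo_runq_latency_us(u32 tid, u64 on_ts)
{
	u64 *rq_ts = bpf_map_lookup_elem(&cpu_runq, &tid);
	u64 rq_la = 0;

	if (rq_ts != 0) {
		if (on_ts > *rq_ts)
			rq_la = (on_ts - *rq_ts) / 1000;
		bpf_map_delete_elem(&cpu_runq, &tid);
	}
	return rq_la;
}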