Skip to content

Commit

Permalink
perf: Add per event clockid support
Browse files Browse the repository at this point in the history
While thinking on the whole clock discussion it occurred to me we have
two distinct uses of time:

 1) the tracking of event/ctx/cgroup enabled/running/stopped times
    which includes the self-monitoring support in struct
    perf_event_mmap_page.

 2) the actual timestamps visible in the data records.

And we've been conflating them.

The first is all about tracking time deltas, nobody should really care
in what time base that happens, it's all relative information, as long
as it's internally consistent it works.

The second however is what people are worried about when having to
merge their data with external sources. And here we have the
discussion on MONOTONIC vs MONOTONIC_RAW etc..

Where MONOTONIC is good for correlating between machines (static
offset), MONOTONIC_RAW is required for correlating against a fixed rate
hardware clock.

This means configurability; now 1) makes that hard because it needs to
be internally consistent across groups of unrelated events; which is
why we had to have a global perf_clock().

However, for 2) it doesn't really matter, perf itself doesn't care
what it writes into the buffer.

The below patch makes the distinction between these two cases by
adding perf_event_clock() which is used for the second case. It
further makes this configurable on a per-event basis, but adds a few
sanity checks such that we cannot combine events with different clocks
in confusing ways.

And since we then have per-event configurability we might as well
retain the 'legacy' behaviour as a default.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
Peter Zijlstra authored and Ingo Molnar committed Mar 27, 2015
1 parent b381e63 commit 34f4392
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 8 deletions.
14 changes: 12 additions & 2 deletions arch/x86/kernel/cpu/perf_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event,

data = cyc2ns_read_begin();

/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
*/
userpg->cap_user_time = 1;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
userpg->time_offset = data->cyc2ns_offset - now;

userpg->cap_user_time_zero = 1;
userpg->time_zero = data->cyc2ns_offset;
/*
* cap_user_time_zero doesn't make sense when we're using a different
* time base for the records.
*/
if (event->clock == &local_clock) {
userpg->cap_user_time_zero = 1;
userpg->time_zero = data->cyc2ns_offset;
}

cyc2ns_read_end(data);
}
Expand Down
2 changes: 2 additions & 0 deletions include/linux/perf_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ struct perf_event;
* pmu::capabilities flags
*/
#define PERF_PMU_CAP_NO_INTERRUPT 0x01
/* When set on a PMU, perf_event_set_clock() also accepts clocks it does not mark nmi_safe. */
#define PERF_PMU_CAP_NO_NMI 0x02

/**
* struct pmu - generic performance monitoring unit
Expand Down Expand Up @@ -457,6 +458,7 @@ struct perf_event {
struct pid_namespace *ns;
u64 id;

u64 (*clock)(void);
perf_overflow_handler_t overflow_handler;
void *overflow_handler_context;

Expand Down
6 changes: 3 additions & 3 deletions include/uapi/linux/perf_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,8 @@ struct perf_event_attr {
exclude_callchain_user : 1, /* exclude user callchains */
mmap2 : 1, /* include mmap with inode data */
comm_exec : 1, /* flag comm events that are due to an exec */
__reserved_1 : 39;
use_clockid : 1, /* use @clockid for time fields */
__reserved_1 : 38;

union {
__u32 wakeup_events; /* wakeup every n events */
Expand Down Expand Up @@ -355,8 +356,7 @@ struct perf_event_attr {
*/
__u32 sample_stack_user;

/* Align to u64. */
__u32 __reserved_2;
__s32 clockid;
/*
* Defines set of regs to dump for each sample
* state captured on:
Expand Down
77 changes: 74 additions & 3 deletions kernel/events/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,11 @@ static inline u64 perf_clock(void)
return local_clock();
}

static inline u64 perf_event_clock(struct perf_event *event)
{
return event->clock();
}

static inline struct perf_cpu_context *
__get_cpu_context(struct perf_event_context *ctx)
{
Expand Down Expand Up @@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
}

if (sample_type & PERF_SAMPLE_TIME)
data->time = perf_clock();
data->time = perf_event_clock(event);

if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
data->id = primary_event_id(event);
Expand Down Expand Up @@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event,
task_event->event_id.tid = perf_event_tid(event, task);
task_event->event_id.ptid = perf_event_tid(event, current);

task_event->event_id.time = perf_event_clock(event);

perf_output_put(&handle, task_event->event_id);

perf_event__output_id_sample(event, &handle, &sample);
Expand Down Expand Up @@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task,
/* .ppid */
/* .tid */
/* .ptid */
.time = perf_clock(),
/* .time */
},
};

Expand Down Expand Up @@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
.misc = 0,
.size = sizeof(throttle_event),
},
.time = perf_clock(),
.time = perf_event_clock(event),
.id = primary_event_id(event),
.stream_id = event->id,
};
Expand Down Expand Up @@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event)
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,

.capabilities = PERF_PMU_CAP_NO_NMI,

.event_init = perf_swevent_init,
.add = perf_swevent_add,
.del = perf_swevent_del,
Expand Down Expand Up @@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event)
static struct pmu perf_cpu_clock = {
.task_ctx_nr = perf_sw_context,

.capabilities = PERF_PMU_CAP_NO_NMI,

.event_init = cpu_clock_event_init,
.add = cpu_clock_event_add,
.del = cpu_clock_event_del,
Expand Down Expand Up @@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct perf_event *event)
static struct pmu perf_task_clock = {
.task_ctx_nr = perf_sw_context,

.capabilities = PERF_PMU_CAP_NO_NMI,

.event_init = task_clock_event_init,
.add = task_clock_event_add,
.del = task_clock_event_del,
Expand Down Expand Up @@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->hw.target = task;
}

event->clock = &local_clock;
if (parent_event)
event->clock = parent_event->clock;

if (!overflow_handler && parent_event) {
overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context;
Expand Down Expand Up @@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
if (output_event->cpu == -1 && output_event->ctx != event->ctx)
goto out;

/*
* Mixing clocks in the same buffer is trouble you don't need.
*/
if (output_event->clock != event->clock)
goto out;

set:
mutex_lock(&event->mmap_mutex);
/* Can't redirect output if we've got an active mmap() */
Expand Down Expand Up @@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
}

/*
 * perf_event_set_clock - pick the function behind @event's ->clock callback
 * @event:  event whose ->clock pointer is being set
 * @clk_id: requested clock id
 *
 * Recognised ids are CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME,
 * CLOCK_BOOTTIME and CLOCK_TAI.  Only the two MONOTONIC variants are treated
 * as NMI safe here; every other clock is accepted only when the event's PMU
 * advertises PERF_PMU_CAP_NO_NMI.
 *
 * Returns 0 on success.  Returns -EINVAL for an unrecognised id (->clock is
 * left untouched) or when the chosen clock is not NMI safe and the PMU lacks
 * PERF_PMU_CAP_NO_NMI (->clock has already been overwritten at that point).
 */
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
	u64 (*clock_fn)(void);
	bool needs_no_nmi_pmu = true;

	switch (clk_id) {
	case CLOCK_MONOTONIC:
		clock_fn = &ktime_get_mono_fast_ns;
		needs_no_nmi_pmu = false;
		break;

	case CLOCK_MONOTONIC_RAW:
		clock_fn = &ktime_get_raw_fast_ns;
		needs_no_nmi_pmu = false;
		break;

	case CLOCK_REALTIME:
		clock_fn = &ktime_get_real_ns;
		break;

	case CLOCK_BOOTTIME:
		clock_fn = &ktime_get_boot_ns;
		break;

	case CLOCK_TAI:
		clock_fn = &ktime_get_tai_ns;
		break;

	default:
		return -EINVAL;
	}

	/* Install the clock first; the capability check below may still fail. */
	event->clock = clock_fn;

	if (!needs_no_nmi_pmu)
		return 0;

	if (event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)
		return 0;

	return -EINVAL;
}

/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
Expand Down Expand Up @@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open,
*/
pmu = event->pmu;

if (attr.use_clockid) {
err = perf_event_set_clock(event, attr.clockid);
if (err)
goto err_alloc;
}

if (group_leader &&
(is_software_event(event) != is_software_event(group_leader))) {
if (is_software_event(event)) {
Expand Down Expand Up @@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open,
*/
if (group_leader->group_leader != group_leader)
goto err_context;

/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
goto err_context;

/*
* Do not allow to attach to a group in a different
* task or CPU context:
Expand Down

0 comments on commit 34f4392

Please sign in to comment.