Skip to content

Commit

Permalink
perf: Rewrite core context handling
Browse files Browse the repository at this point in the history
There have been various issues and limitations with the way perf uses
(task) contexts to track events. Most notable is the single hardware
PMU task context, which has resulted in a number of yucky things (both
proposed and merged).

Notably:
 - HW breakpoint PMU
 - ARM big.little PMU / Intel ADL PMU
 - Intel Branch Monitoring PMU
 - AMD IBS PMU
 - S390 cpum_cf PMU
 - PowerPC trace_imc PMU

*Current design:*

Currently we have per-task and per-CPU perf_event_contexts:

  task_struct::perf_events_ctxp[] <-> perf_event_context <-> perf_cpu_context
       ^                                 |    ^     |           ^
       `---------------------------------'    |     `--> pmu ---'
                                              v           ^
                                         perf_event ------'

Each task has an array of pointers to perf_event_contexts. Each
perf_event_context has a direct relation to a PMU and a group of
events for that PMU. The task-related perf_event_contexts have a
pointer back to that task.

Each PMU has a per-cpu pointer to a per-cpu perf_cpu_context, which
includes a perf_event_context, which again has a direct relation to
that PMU, and a group of events for that PMU.

The perf_cpu_context also tracks which task context is currently
associated with that CPU and includes a few other things like the
hrtimer for rotation etc.

Each perf_event is then associated with its PMU and one
perf_event_context.

*Proposed design:*

The new design proposed by this patch reduces this to a single task
context and a single CPU context, but adds some intermediate data
structures:

  task_struct::perf_event_ctxp -> perf_event_context <- perf_cpu_context
       ^                           |   ^ ^
       `---------------------------'   | |
                                       | |    perf_cpu_pmu_context <--.
                                       | `----.    ^                  |
                                       |      |    |                  |
                                       |      v    v                  |
                                       | ,--> perf_event_pmu_context  |
                                       | |                            |
                                       | |                            |
                                       v v                            |
                                  perf_event ---> pmu ----------------'

With the new design, perf_event_context will hold all events for all
pmus in the (respective pinned/flexible) rbtrees. This is achieved
by adding the pmu to the rbtree key:

  {cpu, pmu, cgroup, group_index}

Each perf_event_context carries a list of perf_event_pmu_context which
is used to hold per-pmu-per-context state. For example, it keeps track
of currently active events for that pmu, a pmu specific task_ctx_data,
a flag to tell whether rotation is required or not etc.

Additionally, perf_cpu_pmu_context is used to hold per-pmu-per-cpu
state, like the hrtimer details that drive event rotation, a pointer to
the perf_event_pmu_context of the currently running task, and some other
ancillary information.

Each perf_event is associated with its pmu, perf_event_context and
perf_event_pmu_context.

Further optimizations to the current implementation are possible. For
example, ctx_resched() can be optimized to reschedule only the events
of a single pmu.

Much thanks to Ravi for picking this up and pushing it towards
completion.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Co-developed-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20221008062424.313-1-ravi.bangoria@amd.com
  • Loading branch information
Peter Zijlstra committed Oct 27, 2022
1 parent 247f34f commit bd27568
Show file tree
Hide file tree
Showing 16 changed files with 1,178 additions and 1,094 deletions.
18 changes: 11 additions & 7 deletions arch/arm64/kernel/perf_event.c
Original file line number Diff line number Diff line change
Expand Up @@ -806,10 +806,14 @@ static void armv8pmu_disable_event(struct perf_event *event)

static void armv8pmu_start(struct arm_pmu *cpu_pmu)
{
struct perf_event_context *task_ctx =
this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx;
struct perf_event_context *ctx;
int nr_user = 0;

if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user)
ctx = perf_cpu_task_ctx();
if (ctx)
nr_user = ctx->nr_user;

if (sysctl_perf_user_access && nr_user)
armv8pmu_enable_user_access(cpu_pmu);
else
armv8pmu_disable_user_access();
Expand Down Expand Up @@ -1019,10 +1023,10 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
return 0;
}

static int armv8pmu_filter_match(struct perf_event *event)
static bool armv8pmu_filter(struct pmu *pmu, int cpu)
{
unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT;
return evtype != ARMV8_PMUV3_PERFCTR_CHAIN;
struct arm_pmu *armpmu = to_arm_pmu(pmu);
return !cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus);
}

static void armv8pmu_reset(void *info)
Expand Down Expand Up @@ -1253,7 +1257,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
cpu_pmu->stop = armv8pmu_stop;
cpu_pmu->reset = armv8pmu_reset;
cpu_pmu->set_event_filter = armv8pmu_set_event_filter;
cpu_pmu->filter_match = armv8pmu_filter_match;
cpu_pmu->filter = armv8pmu_filter;

cpu_pmu->pmu.event_idx = armv8pmu_user_event_idx;

Expand Down
8 changes: 4 additions & 4 deletions arch/powerpc/perf/core-book3s.c
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)

static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) {}
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in) {}
static inline void power_pmu_bhrb_read(struct perf_event *event, struct cpu_hw_events *cpuhw) {}
static void pmao_restore_workaround(bool ebb) { }
#endif /* CONFIG_PPC32 */
Expand Down Expand Up @@ -424,7 +424,7 @@ static void power_pmu_bhrb_enable(struct perf_event *event)
cpuhw->bhrb_context = event->ctx;
}
cpuhw->bhrb_users++;
perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);
}

static void power_pmu_bhrb_disable(struct perf_event *event)
Expand All @@ -436,7 +436,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)

WARN_ON_ONCE(!cpuhw->bhrb_users);
cpuhw->bhrb_users--;
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);

if (!cpuhw->disabled && !cpuhw->bhrb_users) {
/* BHRB cannot be turned off when other
Expand All @@ -451,7 +451,7 @@ static void power_pmu_bhrb_disable(struct perf_event *event)
/* Called from ctxsw to prevent one process's branch entries to
* mingle with the other process's entries during context switch.
*/
static void power_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
static void power_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
if (!ppmu->bhrb_nr)
return;
Expand Down
2 changes: 1 addition & 1 deletion arch/s390/kernel/perf_pai_crypto.c
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ static int paicrypt_push_sample(void)
/* Called on schedule-in and schedule-out. No access to event structure,
* but for sampling only event CRYPTO_ALL is allowed.
*/
static void paicrypt_sched_task(struct perf_event_context *ctx, bool sched_in)
static void paicrypt_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
/* We started with a clean page on event installation. So read out
* results on schedule_out and if page was dirty, clear values.
Expand Down
2 changes: 1 addition & 1 deletion arch/s390/kernel/perf_pai_ext.c
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ static int paiext_push_sample(void)
/* Called on schedule-in and schedule-out. No access to event structure,
* but for sampling only event NNPA_ALL is allowed.
*/
static void paiext_sched_task(struct perf_event_context *ctx, bool sched_in)
static void paiext_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
/* We started with a clean page on event installation. So read out
* results on schedule_out and if page was dirty, clear values.
Expand Down
2 changes: 1 addition & 1 deletion arch/x86/events/amd/brs.c
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ static void amd_brs_poison_buffer(void)
* On ctxswin, sched_in = true, called after the PMU has started
* On ctxswout, sched_in = false, called before the PMU is stopped
*/
void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in)
void amd_pmu_brs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

Expand Down
6 changes: 3 additions & 3 deletions arch/x86/events/amd/lbr.c
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ void amd_pmu_lbr_add(struct perf_event *event)
cpuc->br_sel = reg->reg;
}

perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);

if (!cpuc->lbr_users++ && !event->total_time_running)
amd_pmu_lbr_reset();
Expand All @@ -370,10 +370,10 @@ void amd_pmu_lbr_del(struct perf_event *event)

cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);
}

void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
void amd_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

Expand Down
44 changes: 14 additions & 30 deletions arch/x86/events/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);

DEFINE_STATIC_CALL_NULL(x86_pmu_filter, *x86_pmu.filter);

/*
* This one is magic, it will get called even when PMU init fails (because
* there is no PMU), in which case it should simply return NULL.
Expand Down Expand Up @@ -2031,6 +2033,7 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);

static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs);
static_call_update(x86_pmu_filter, x86_pmu.filter);
}

static void _x86_pmu_read(struct perf_event *event)
Expand All @@ -2052,23 +2055,6 @@ void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
pr_info("... event mask: %016Lx\n", intel_ctrl);
}

/*
* The generic code is not hybrid friendly. The hybrid_pmu->pmu
* of the first registered PMU is unconditionally assigned to
* each possible cpuctx->ctx.pmu.
* Update the correct hybrid PMU to the cpuctx->ctx.pmu.
*/
void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu)
{
struct perf_cpu_context *cpuctx;

if (!pmu->pmu_cpu_context)
return;

cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
cpuctx->ctx.pmu = pmu;
}

static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
Expand Down Expand Up @@ -2195,9 +2181,6 @@ static int __init init_hw_perf_events(void)
(hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
if (err)
break;

if (cpu_type == hybrid_pmu->cpu_type)
x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id());
}

if (i < x86_pmu.num_hybrid_pmus) {
Expand Down Expand Up @@ -2646,15 +2629,15 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};

static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
static void x86_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
static_call_cond(x86_pmu_sched_task)(pmu_ctx, sched_in);
}

static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next)
static void x86_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc)
{
static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
static_call_cond(x86_pmu_swap_task_ctx)(prev_epc, next_epc);
}

void perf_check_microcode(void)
Expand Down Expand Up @@ -2689,12 +2672,13 @@ static int x86_pmu_aux_output_match(struct perf_event *event)
return 0;
}

static int x86_pmu_filter_match(struct perf_event *event)
static bool x86_pmu_filter(struct pmu *pmu, int cpu)
{
if (x86_pmu.filter_match)
return x86_pmu.filter_match(event);
bool ret = false;

return 1;
static_call_cond(x86_pmu_filter)(pmu, cpu, &ret);

return ret;
}

static struct pmu pmu = {
Expand Down Expand Up @@ -2725,7 +2709,7 @@ static struct pmu pmu = {

.aux_output_match = x86_pmu_aux_output_match,

.filter_match = x86_pmu_filter_match,
.filter = x86_pmu_filter,
};

void arch_perf_update_userpage(struct perf_event *event,
Expand Down
23 changes: 10 additions & 13 deletions arch/x86/events/intel/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -4536,8 +4536,6 @@ static bool init_hybrid_pmu(int cpu)
cpumask_set_cpu(cpu, &pmu->supported_cpus);
cpuc->pmu = &pmu->pmu;

x86_pmu_update_cpu_context(&pmu->pmu, cpu);

return true;
}

Expand Down Expand Up @@ -4671,17 +4669,17 @@ static void intel_pmu_cpu_dead(int cpu)
cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
}

static void intel_pmu_sched_task(struct perf_event_context *ctx,
static void intel_pmu_sched_task(struct perf_event_pmu_context *pmu_ctx,
bool sched_in)
{
intel_pmu_pebs_sched_task(ctx, sched_in);
intel_pmu_lbr_sched_task(ctx, sched_in);
intel_pmu_pebs_sched_task(pmu_ctx, sched_in);
intel_pmu_lbr_sched_task(pmu_ctx, sched_in);
}

static void intel_pmu_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next)
static void intel_pmu_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc)
{
intel_pmu_lbr_swap_task_ctx(prev, next);
intel_pmu_lbr_swap_task_ctx(prev_epc, next_epc);
}

static int intel_pmu_check_period(struct perf_event *event, u64 value)
Expand All @@ -4705,12 +4703,11 @@ static int intel_pmu_aux_output_match(struct perf_event *event)
return is_intel_pt_event(event);
}

static int intel_pmu_filter_match(struct perf_event *event)
static void intel_pmu_filter(struct pmu *pmu, int cpu, bool *ret)
{
struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
unsigned int cpu = smp_processor_id();
struct x86_hybrid_pmu *hpmu = hybrid_pmu(pmu);

return cpumask_test_cpu(cpu, &pmu->supported_cpus);
*ret = !cpumask_test_cpu(cpu, &hpmu->supported_cpus);
}

PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
Expand Down Expand Up @@ -6412,7 +6409,7 @@ __init int intel_pmu_init(void)
static_call_update(intel_pmu_set_topdown_event_period,
&adl_set_topdown_event_period);

x86_pmu.filter_match = intel_pmu_filter_match;
x86_pmu.filter = intel_pmu_filter;
x86_pmu.get_event_constraints = adl_get_event_constraints;
x86_pmu.hw_config = adl_hw_config;
x86_pmu.limit_period = spr_limit_period;
Expand Down
4 changes: 2 additions & 2 deletions arch/x86/events/intel/ds.c
Original file line number Diff line number Diff line change
Expand Up @@ -1059,7 +1059,7 @@ static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
}

void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
void intel_pmu_pebs_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

Expand Down Expand Up @@ -1167,7 +1167,7 @@ static void
pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc,
struct perf_event *event, bool add)
{
struct pmu *pmu = event->ctx->pmu;
struct pmu *pmu = event->pmu;
/*
* Make sure we get updated with the first PEBS
* event. It will trigger also during removal, but
Expand Down
30 changes: 15 additions & 15 deletions arch/x86/events/intel/lbr.c
Original file line number Diff line number Diff line change
Expand Up @@ -515,21 +515,21 @@ static void __intel_pmu_lbr_save(void *ctx)
cpuc->last_log_id = ++task_context_opt(ctx)->log_id;
}

void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next)
void intel_pmu_lbr_swap_task_ctx(struct perf_event_pmu_context *prev_epc,
struct perf_event_pmu_context *next_epc)
{
void *prev_ctx_data, *next_ctx_data;

swap(prev->task_ctx_data, next->task_ctx_data);
swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);

/*
* Architecture specific synchronization makes sense in
* case both prev->task_ctx_data and next->task_ctx_data
* Architecture specific synchronization makes sense in case
* both prev_epc->task_ctx_data and next_epc->task_ctx_data
* pointers are allocated.
*/

prev_ctx_data = next->task_ctx_data;
next_ctx_data = prev->task_ctx_data;
prev_ctx_data = next_epc->task_ctx_data;
next_ctx_data = prev_epc->task_ctx_data;

if (!prev_ctx_data || !next_ctx_data)
return;
Expand All @@ -538,7 +538,7 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
task_context_opt(next_ctx_data)->lbr_callstack_users);
}

void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
void intel_pmu_lbr_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
void *task_ctx;
Expand All @@ -551,7 +551,7 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
* the task was scheduled out, restore the stack. Otherwise flush
* the LBR stack.
*/
task_ctx = ctx ? ctx->task_ctx_data : NULL;
task_ctx = pmu_ctx ? pmu_ctx->task_ctx_data : NULL;
if (task_ctx) {
if (sched_in)
__intel_pmu_lbr_restore(task_ctx);
Expand Down Expand Up @@ -587,8 +587,8 @@ void intel_pmu_lbr_add(struct perf_event *event)

cpuc->br_sel = event->hw.branch_reg.reg;

if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data)
task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++;
if (branch_user_callstack(cpuc->br_sel) && event->pmu_ctx->task_ctx_data)
task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users++;

/*
* Request pmu::sched_task() callback, which will fire inside the
Expand All @@ -611,7 +611,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
*/
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
cpuc->lbr_pebs_users++;
perf_sched_cb_inc(event->ctx->pmu);
perf_sched_cb_inc(event->pmu);
if (!cpuc->lbr_users++ && !event->total_time_running)
intel_pmu_lbr_reset();
}
Expand Down Expand Up @@ -664,8 +664,8 @@ void intel_pmu_lbr_del(struct perf_event *event)
return;

if (branch_user_callstack(cpuc->br_sel) &&
event->ctx->task_ctx_data)
task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--;
event->pmu_ctx->task_ctx_data)
task_context_opt(event->pmu_ctx->task_ctx_data)->lbr_callstack_users--;

if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
cpuc->lbr_select = 0;
Expand All @@ -675,7 +675,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
WARN_ON_ONCE(cpuc->lbr_pebs_users < 0);
perf_sched_cb_dec(event->ctx->pmu);
perf_sched_cb_dec(event->pmu);
}

static inline bool vlbr_exclude_host(void)
Expand Down
Loading

0 comments on commit bd27568

Please sign in to comment.