Skip to content

Commit

Permalink
Merge branch 'harden-and-extend-elf-build-id-parsing-logic'
Browse files Browse the repository at this point in the history
Andrii Nakryiko says:

====================
Harden and extend ELF build ID parsing logic

The goal of this patch set is to extend existing ELF build ID parsing logic,
currently mostly used by BPF subsystem, with support for working in sleepable
mode in which memory faults are allowed and can be relied upon to fetch
relevant parts of ELF file to find and fetch .note.gnu.build-id information.

This is useful and important for BPF subsystem itself, but also for
PROCMAP_QUERY ioctl(), built atop of /proc/<pid>/maps functionality (see [0]),
which makes use of the same build_id_parse() functionality. PROCMAP_QUERY is
always called from sleepable user process context, so it doesn't have to
suffer from current restrictions of build_id_parse() which are due to the NMI
context assumption.

Along the way, we harden the logic to avoid TOCTOU, overflow, out-of-bounds
access problems.  This is the very first patch, which can be backported to
older releases, if necessary.

We also lift existing limitations of only working as long as ELF program
headers and build ID note section is contained strictly within the very first
page of ELF file.

We achieve all of the above without duplication of logic between sleepable and
non-sleepable modes through freader abstraction that manages underlying folio
from page cache (on demand) and gives a simple to use direct memory access
interface. With that, single page restrictions and adding sleepable mode
support is rather straightforward.

We also extend existing set of BPF selftests with a few tests targeting build
ID logic across sleepable and non-sleepabe contexts (we utilize sleepable and
non-sleepable uprobes for that).

   [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-4-andrii@kernel.org/

v6->v7:
  - added filemap_invalidate_{lock,unlock}_shared() around read_cache_folio
    and kept Eduard's Reviewed-by (Eduard);
v5->v6:
  - use local phnum variable in get_build_id_32() (Jann);
  - switch memcmp() instead of strcmp() in parse_build_id() (Jann);
v4->v5:
  - pass proper file reference to read_cache_folio() (Shakeel);
  - fix another potential overflow due to two u32 additions (Andi);
  - add PageUptodate() check to patch openbmc#1 (Jann);
v3->v4:
  - fix few more potential overflow and out-of-bounds access issues (Andi);
  - use purely folio-based implementation for freader (Matthew);
v2->v3:
  - remove unneeded READ_ONCE()s and force phoff to u64 for 32-bit mode (Andi);
  - moved hardening fixes to the front for easier backporting (Jann);
  - call freader_cleanup() from build_id_parse_buf() for consistency (Jiri);
v1->v2:
  - ensure MADV_PAGEOUT works reliably by paging data in first (Shakeel);
  - to fix BPF CI build optionally define MADV_POPULATE_READ in selftest.
====================

Link: https://lore.kernel.org/r/20240829174232.3133883-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
  • Loading branch information
Alexei Starovoitov committed Sep 11, 2024
2 parents 58ff04e + 3c217a1 commit f765274
Show file tree
Hide file tree
Showing 11 changed files with 605 additions and 142 deletions.
2 changes: 2 additions & 0 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -3200,7 +3200,9 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
extern const struct bpf_func_proto bpf_get_stack_proto;
extern const struct bpf_func_proto bpf_get_stack_sleepable_proto;
extern const struct bpf_func_proto bpf_get_task_stack_proto;
extern const struct bpf_func_proto bpf_get_task_stack_sleepable_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto_pe;
extern const struct bpf_func_proto bpf_get_stack_proto_pe;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
Expand Down
4 changes: 2 additions & 2 deletions include/linux/buildid.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
#define BUILD_ID_SIZE_MAX 20

struct vm_area_struct;
int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
__u32 *size);
int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);

#if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO)
Expand Down
131 changes: 101 additions & 30 deletions kernel/bpf/stackmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,24 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
return ERR_PTR(err);
}

static int fetch_build_id(struct vm_area_struct *vma, unsigned char *build_id, bool may_fault)
{
return may_fault ? build_id_parse(vma, build_id, NULL)
: build_id_parse_nofault(vma, build_id, NULL);
}

/*
* Expects all id_offs[i].ip values to be set to correct initial IPs.
* They will be subsequently:
* - either adjusted in place to a file offset, if build ID fetching
* succeeds; in this case id_offs[i].build_id is set to correct build ID,
* and id_offs[i].status is set to BPF_STACK_BUILD_ID_VALID;
* - or IP will be kept intact, if build ID fetching failed; in this case
* id_offs[i].build_id is zeroed out and id_offs[i].status is set to
* BPF_STACK_BUILD_ID_IP.
*/
static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u64 *ips, u32 trace_nr, bool user)
u32 trace_nr, bool user, bool may_fault)
{
int i;
struct mmap_unlock_irq_work *work = NULL;
Expand All @@ -142,30 +158,28 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
}
return;
}

for (i = 0; i < trace_nr; i++) {
if (range_in_vma(prev_vma, ips[i], ips[i])) {
u64 ip = READ_ONCE(id_offs[i].ip);

if (range_in_vma(prev_vma, ip, ip)) {
vma = prev_vma;
memcpy(id_offs[i].build_id, prev_build_id,
BUILD_ID_SIZE_MAX);
memcpy(id_offs[i].build_id, prev_build_id, BUILD_ID_SIZE_MAX);
goto build_id_valid;
}
vma = find_vma(current->mm, ips[i]);
if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
vma = find_vma(current->mm, ip);
if (!vma || fetch_build_id(vma, id_offs[i].build_id, may_fault)) {
/* per entry fall back to ips */
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
id_offs[i].ip = ips[i];
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
build_id_valid:
id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- vma->vm_start;
id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ip - vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
prev_vma = vma;
prev_build_id = id_offs[i].build_id;
Expand Down Expand Up @@ -216,7 +230,7 @@ static long __bpf_get_stackid(struct bpf_map *map,
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
u32 hash, id, trace_nr, trace_len, i;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
Expand All @@ -238,15 +252,18 @@ static long __bpf_get_stackid(struct bpf_map *map,
return id;

if (stack_map_use_build_id(map)) {
struct bpf_stack_build_id *id_offs;

/* for build_id+offset, pop a bucket before slow cmp */
new_bucket = (struct stack_map_bucket *)
pcpu_freelist_pop(&smap->freelist);
if (unlikely(!new_bucket))
return -ENOMEM;
new_bucket->nr = trace_nr;
stack_map_get_build_id_offset(
(struct bpf_stack_build_id *)new_bucket->data,
ips, trace_nr, user);
id_offs = (struct bpf_stack_build_id *)new_bucket->data;
for (i = 0; i < trace_nr; i++)
id_offs[i].ip = ips[i];
stack_map_get_build_id_offset(id_offs, trace_nr, user, false /* !may_fault */);
trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
if (hash_matches && bucket->nr == trace_nr &&
memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
Expand Down Expand Up @@ -387,7 +404,7 @@ const struct bpf_func_proto bpf_get_stackid_proto_pe = {

static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
void *buf, u32 size, u64 flags)
void *buf, u32 size, u64 flags, bool may_fault)
{
u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
Expand All @@ -405,8 +422,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (kernel && user_build_id)
goto clear;

elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
: sizeof(u64);
elem_size = user_build_id ? sizeof(struct bpf_stack_build_id) : sizeof(u64);
if (unlikely(size % elem_size))
goto clear;

Expand All @@ -427,28 +443,44 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
if (sysctl_perf_event_max_stack < max_depth)
max_depth = sysctl_perf_event_max_stack;

if (may_fault)
rcu_read_lock(); /* need RCU for perf's callchain below */

if (trace_in)
trace = trace_in;
else if (kernel && task)
trace = get_callchain_entry_for_task(task, max_depth);
else
trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
crosstask, false);
if (unlikely(!trace))
goto err_fault;

if (trace->nr < skip)
if (unlikely(!trace) || trace->nr < skip) {
if (may_fault)
rcu_read_unlock();
goto err_fault;
}

trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;

ips = trace->ip + skip;
if (user && user_build_id)
stack_map_get_build_id_offset(buf, ips, trace_nr, user);
else
if (user_build_id) {
struct bpf_stack_build_id *id_offs = buf;
u32 i;

for (i = 0; i < trace_nr; i++)
id_offs[i].ip = ips[i];
} else {
memcpy(buf, ips, copy_len);
}

/* trace/ips should not be dereferenced after this point */
if (may_fault)
rcu_read_unlock();

if (user_build_id)
stack_map_get_build_id_offset(buf, trace_nr, user, may_fault);

if (size > copy_len)
memset(buf + copy_len, 0, size - copy_len);
Expand All @@ -464,7 +496,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
u64, flags)
{
return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);
}

const struct bpf_func_proto bpf_get_stack_proto = {
Expand All @@ -477,8 +509,24 @@ const struct bpf_func_proto bpf_get_stack_proto = {
.arg4_type = ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
u32, size, u64, flags)
BPF_CALL_4(bpf_get_stack_sleepable, struct pt_regs *, regs, void *, buf, u32, size,
u64, flags)
{
return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, true /* may_fault */);
}

const struct bpf_func_proto bpf_get_stack_sleepable_proto = {
.func = bpf_get_stack_sleepable,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};

static long __bpf_get_task_stack(struct task_struct *task, void *buf, u32 size,
u64 flags, bool may_fault)
{
struct pt_regs *regs;
long res = -EINVAL;
Expand All @@ -488,12 +536,18 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,

regs = task_pt_regs(task);
if (regs)
res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
res = __bpf_get_stack(regs, task, NULL, buf, size, flags, may_fault);
put_task_stack(task);

return res;
}

BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
u32, size, u64, flags)
{
return __bpf_get_task_stack(task, buf, size, flags, false /* !may_fault */);
}

const struct bpf_func_proto bpf_get_task_stack_proto = {
.func = bpf_get_task_stack,
.gpl_only = false,
Expand All @@ -505,6 +559,23 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
.arg4_type = ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_task_stack_sleepable, struct task_struct *, task, void *, buf,
u32, size, u64, flags)
{
return __bpf_get_task_stack(task, buf, size, flags, true /* !may_fault */);
}

const struct bpf_func_proto bpf_get_task_stack_sleepable_proto = {
.func = bpf_get_task_stack_sleepable,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};

BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
void *, buf, u32, size, u64, flags)
{
Expand All @@ -516,7 +587,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr_kernel;

if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
return __bpf_get_stack(regs, NULL, NULL, buf, size, flags, false /* !may_fault */);

if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
BPF_F_USER_BUILD_ID)))
Expand All @@ -536,7 +607,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
__u64 nr = trace->nr;

trace->nr = nr_kernel;
err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);

/* restore nr */
trace->nr = nr;
Expand All @@ -548,7 +619,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
goto clear;

flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
err = __bpf_get_stack(regs, NULL, trace, buf, size, flags);
err = __bpf_get_stack(regs, NULL, trace, buf, size, flags, false /* !may_fault */);
}
return err;

Expand Down
2 changes: 1 addition & 1 deletion kernel/events/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -8851,7 +8851,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;

if (atomic_read(&nr_build_id_events))
build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);

perf_iterate_sb(perf_event_mmap_output,
mmap_event,
Expand Down
5 changes: 3 additions & 2 deletions kernel/trace/bpf_trace.c
Original file line number Diff line number Diff line change
Expand Up @@ -1507,7 +1507,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
case BPF_FUNC_get_task_stack:
return &bpf_get_task_stack_proto;
return prog->sleepable ? &bpf_get_task_stack_sleepable_proto
: &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return &bpf_copy_from_user_proto;
case BPF_FUNC_copy_from_user_task:
Expand Down Expand Up @@ -1563,7 +1564,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
case BPF_FUNC_get_stack:
return &bpf_get_stack_proto;
return prog->sleepable ? &bpf_get_stack_sleepable_proto : &bpf_get_stack_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
case BPF_FUNC_override_return:
return &bpf_override_return_proto;
Expand Down
Loading

0 comments on commit f765274

Please sign in to comment.