Skip to content

Commit

Permalink
bpf: Move rcu lock management out of BPF_PROG_RUN routines
Browse files Browse the repository at this point in the history
Commit 7d08c2c ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros
into functions") switched a bunch of BPF_PROG_RUN macros to inline
routines. This changed the semantic a bit. Due to arguments expansion
of macros, it used to be:

	rcu_read_lock();
	array = rcu_dereference(cgrp->bpf.effective[atype]);
	...

Now, with with inline routines, we have:
	array_rcu = rcu_dereference(cgrp->bpf.effective[atype]);
	/* array_rcu can be kfree'd here */
	rcu_read_lock();
	array = rcu_dereference(array_rcu);

I'm assuming in practice rcu subsystem isn't fast enough to trigger
this but let's use rcu API properly.

Also, rename to lower caps to not confuse with macros. Additionally,
drop and expand BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY.

See [1] for more context.

  [1] https://lore.kernel.org/bpf/CAKH8qBs60fOinFdxiiQikK_q0EcVxGvNTQoWvHLEUGbgcj1UYg@mail.gmail.com/T/#u

v2
- keep rcu locks inside by passing cgroup_bpf

Fixes: 7d08c2c ("bpf: Refactor BPF_PROG_RUN_ARRAY family of macros into functions")
Signed-off-by: Stanislav Fomichev <sdf@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20220414161233.170780-1-sdf@google.com
  • Loading branch information
fomichev authored and Alexei Starovoitov committed Apr 19, 2022
1 parent 2324257 commit 055eb95
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 128 deletions.
8 changes: 6 additions & 2 deletions drivers/media/rc/bpf-lirc.c
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,12 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample)

raw->bpf_sample = sample;

if (raw->progs)
BPF_PROG_RUN_ARRAY(raw->progs, &raw->bpf_sample, bpf_prog_run);
if (raw->progs) {
rcu_read_lock();
bpf_prog_run_array(rcu_dereference(raw->progs),
&raw->bpf_sample, bpf_prog_run);
rcu_read_unlock();
}
}

/*
Expand Down
115 changes: 7 additions & 108 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -1221,7 +1221,7 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
/* an array of programs to be executed under rcu_lock.
*
* Typical usage:
* ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, bpf_prog_run);
* ret = bpf_prog_run_array(rcu_dereference(&bpf_prog_array), ctx, bpf_prog_run);
*
* the structure returned by bpf_prog_array_alloc() should be populated
* with program pointers and the last pointer must be NULL.
Expand Down Expand Up @@ -1315,83 +1315,22 @@ static inline void bpf_reset_run_ctx(struct bpf_run_ctx *old_ctx)

typedef u32 (*bpf_prog_run_fn)(const struct bpf_prog *prog, const void *ctx);

static __always_inline int
BPF_PROG_RUN_ARRAY_CG_FLAGS(const struct bpf_prog_array __rcu *array_rcu,
const void *ctx, bpf_prog_run_fn run_prog,
int retval, u32 *ret_flags)
{
const struct bpf_prog_array_item *item;
const struct bpf_prog *prog;
const struct bpf_prog_array *array;
struct bpf_run_ctx *old_run_ctx;
struct bpf_cg_run_ctx run_ctx;
u32 func_ret;

run_ctx.retval = retval;
migrate_disable();
rcu_read_lock();
array = rcu_dereference(array_rcu);
item = &array->items[0];
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
while ((prog = READ_ONCE(item->prog))) {
run_ctx.prog_item = item;
func_ret = run_prog(prog, ctx);
if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval))
run_ctx.retval = -EPERM;
*(ret_flags) |= (func_ret >> 1);
item++;
}
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
migrate_enable();
return run_ctx.retval;
}

static __always_inline int
BPF_PROG_RUN_ARRAY_CG(const struct bpf_prog_array __rcu *array_rcu,
const void *ctx, bpf_prog_run_fn run_prog,
int retval)
{
const struct bpf_prog_array_item *item;
const struct bpf_prog *prog;
const struct bpf_prog_array *array;
struct bpf_run_ctx *old_run_ctx;
struct bpf_cg_run_ctx run_ctx;

run_ctx.retval = retval;
migrate_disable();
rcu_read_lock();
array = rcu_dereference(array_rcu);
item = &array->items[0];
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
while ((prog = READ_ONCE(item->prog))) {
run_ctx.prog_item = item;
if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval))
run_ctx.retval = -EPERM;
item++;
}
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
migrate_enable();
return run_ctx.retval;
}

static __always_inline u32
BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu,
bpf_prog_run_array(const struct bpf_prog_array *array,
const void *ctx, bpf_prog_run_fn run_prog)
{
const struct bpf_prog_array_item *item;
const struct bpf_prog *prog;
const struct bpf_prog_array *array;
struct bpf_run_ctx *old_run_ctx;
struct bpf_trace_run_ctx run_ctx;
u32 ret = 1;

migrate_disable();
rcu_read_lock();
array = rcu_dereference(array_rcu);
RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");

if (unlikely(!array))
goto out;
return ret;

migrate_disable();
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
item = &array->items[0];
while ((prog = READ_ONCE(item->prog))) {
Expand All @@ -1400,50 +1339,10 @@ BPF_PROG_RUN_ARRAY(const struct bpf_prog_array __rcu *array_rcu,
item++;
}
bpf_reset_run_ctx(old_run_ctx);
out:
rcu_read_unlock();
migrate_enable();
return ret;
}

/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
* so BPF programs can request cwr for TCP packets.
*
* Current cgroup skb programs can only return 0 or 1 (0 to drop the
* packet. This macro changes the behavior so the low order bit
* indicates whether the packet should be dropped (0) or not (1)
* and the next bit is a congestion notification bit. This could be
* used by TCP to call tcp_enter_cwr()
*
* Hence, new allowed return values of CGROUP EGRESS BPF programs are:
* 0: drop packet
* 1: keep packet
* 2: drop packet and cn
* 3: keep packet and cn
*
* This macro then converts it to one of the NET_XMIT or an error
* code that is then interpreted as drop packet (and no cn):
* 0: NET_XMIT_SUCCESS skb should be transmitted
* 1: NET_XMIT_DROP skb should be dropped and cn
* 2: NET_XMIT_CN skb should be transmitted and cn
* 3: -err skb should be dropped
*/
#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \
({ \
u32 _flags = 0; \
bool _cn; \
u32 _ret; \
_ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(array, ctx, func, 0, &_flags); \
_cn = _flags & BPF_RET_SET_CN; \
if (_ret && !IS_ERR_VALUE((long)_ret)) \
_ret = -EFAULT; \
if (!_ret) \
_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \
else \
_ret = (_cn ? NET_XMIT_DROP : _ret); \
_ret; \
})

#ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);
extern struct mutex bpf_stats_enabled_mutex;
Expand Down
124 changes: 107 additions & 17 deletions kernel/bpf/cgroup.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,72 @@
DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);

/* __always_inline is necessary to prevent indirect call through run_prog
* function pointer.
*/
static __always_inline int
bpf_prog_run_array_cg_flags(const struct cgroup_bpf *cgrp,
enum cgroup_bpf_attach_type atype,
const void *ctx, bpf_prog_run_fn run_prog,
int retval, u32 *ret_flags)
{
const struct bpf_prog_array_item *item;
const struct bpf_prog *prog;
const struct bpf_prog_array *array;
struct bpf_run_ctx *old_run_ctx;
struct bpf_cg_run_ctx run_ctx;
u32 func_ret;

run_ctx.retval = retval;
migrate_disable();
rcu_read_lock();
array = rcu_dereference(cgrp->effective[atype]);
item = &array->items[0];
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
while ((prog = READ_ONCE(item->prog))) {
run_ctx.prog_item = item;
func_ret = run_prog(prog, ctx);
if (!(func_ret & 1) && !IS_ERR_VALUE((long)run_ctx.retval))
run_ctx.retval = -EPERM;
*(ret_flags) |= (func_ret >> 1);
item++;
}
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
migrate_enable();
return run_ctx.retval;
}

static __always_inline int
bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
enum cgroup_bpf_attach_type atype,
const void *ctx, bpf_prog_run_fn run_prog,
int retval)
{
const struct bpf_prog_array_item *item;
const struct bpf_prog *prog;
const struct bpf_prog_array *array;
struct bpf_run_ctx *old_run_ctx;
struct bpf_cg_run_ctx run_ctx;

run_ctx.retval = retval;
migrate_disable();
rcu_read_lock();
array = rcu_dereference(cgrp->effective[atype]);
item = &array->items[0];
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
while ((prog = READ_ONCE(item->prog))) {
run_ctx.prog_item = item;
if (!run_prog(prog, ctx) && !IS_ERR_VALUE((long)run_ctx.retval))
run_ctx.retval = -EPERM;
item++;
}
bpf_reset_run_ctx(old_run_ctx);
rcu_read_unlock();
migrate_enable();
return run_ctx.retval;
}

void cgroup_bpf_offline(struct cgroup *cgrp)
{
cgroup_get(cgrp);
Expand Down Expand Up @@ -1075,11 +1141,38 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
bpf_compute_and_save_data_end(skb, &saved_data_end);

if (atype == CGROUP_INET_EGRESS) {
ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
u32 flags = 0;
bool cn;

ret = bpf_prog_run_array_cg_flags(
&cgrp->bpf, atype,
skb, __bpf_prog_run_save_cb, 0, &flags);

/* Return values of CGROUP EGRESS BPF programs are:
* 0: drop packet
* 1: keep packet
* 2: drop packet and cn
* 3: keep packet and cn
*
* The returned value is then converted to one of the NET_XMIT
* or an error code that is then interpreted as drop packet
* (and no cn):
* 0: NET_XMIT_SUCCESS skb should be transmitted
* 1: NET_XMIT_DROP skb should be dropped and cn
* 2: NET_XMIT_CN skb should be transmitted and cn
* 3: -err skb should be dropped
*/

cn = flags & BPF_RET_SET_CN;
if (ret && !IS_ERR_VALUE((long)ret))
ret = -EFAULT;
if (!ret)
ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS);
else
ret = (cn ? NET_XMIT_DROP : ret);
} else {
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
__bpf_prog_run_save_cb, 0);
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype,
skb, __bpf_prog_run_save_cb, 0);
if (ret && !IS_ERR_VALUE((long)ret))
ret = -EFAULT;
}
Expand Down Expand Up @@ -1109,8 +1202,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
bpf_prog_run, 0);
return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);

Expand Down Expand Up @@ -1155,8 +1247,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
}

cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
bpf_prog_run, 0, flags);
return bpf_prog_run_array_cg_flags(&cgrp->bpf, atype,
&ctx, bpf_prog_run, 0, flags);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);

Expand All @@ -1182,8 +1274,8 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
bpf_prog_run, 0);
return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run,
0);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);

Expand All @@ -1200,8 +1292,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,

rcu_read_lock();
cgrp = task_dfl_cgroup(current);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
bpf_prog_run, 0);
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0);
rcu_read_unlock();

return ret;
Expand Down Expand Up @@ -1366,8 +1457,7 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,

rcu_read_lock();
cgrp = task_dfl_cgroup(current);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
bpf_prog_run, 0);
ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0);
rcu_read_unlock();

kfree(ctx.cur_val);
Expand Down Expand Up @@ -1459,7 +1549,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
}

lock_sock(sk);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT,
&ctx, bpf_prog_run, 0);
release_sock(sk);

Expand Down Expand Up @@ -1559,7 +1649,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
}

lock_sock(sk);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
&ctx, bpf_prog_run, retval);
release_sock(sk);

Expand Down Expand Up @@ -1608,7 +1698,7 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
* be called if that data shouldn't be "exported".
*/

ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT,
&ctx, bpf_prog_run, retval);
if (ret < 0)
return ret;
Expand Down
5 changes: 4 additions & 1 deletion kernel/trace/bpf_trace.c
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,10 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
* out of events when it was updated in between this and the
* rcu_dereference() which is accepted risk.
*/
ret = BPF_PROG_RUN_ARRAY(call->prog_array, ctx, bpf_prog_run);
rcu_read_lock();
ret = bpf_prog_run_array(rcu_dereference(call->prog_array),
ctx, bpf_prog_run);
rcu_read_unlock();

out:
__this_cpu_dec(bpf_prog_active);
Expand Down

0 comments on commit 055eb95

Please sign in to comment.