Skip to content

Commit

Permalink
Merge branch 'bpf-tcp-rtt-hook'
Browse files Browse the repository at this point in the history
Stanislav Fomichev says:

====================
The congestion control team would like to have a periodic callback to
track some TCP statistics. Let's add a sock_ops callback that can be
selectively enabled on a socket-by-socket basis and is executed for
every RTT. BPF program frequency can be further controlled by calling
bpf_ktime_get_ns() and bailing out early.

I ran the neper tcp_stream and tcp_rr tests with the sample program
from the last patch and didn't observe any noticeable performance
difference.

v2:
* add a comment about second accept() in selftest (Yonghong Song)
* refer to tcp_bpf.readme in sample program (Yonghong Song)
====================

Suggested-by: Eric Dumazet <edumazet@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Priyaranjan Jha <priyarjha@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Acked-by: Yonghong Song <yhs@fb.com>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
  • Loading branch information
borkmann committed Jul 3, 2019
2 parents d2f5bbb + d78e3f0 commit e5a3e25
Show file tree
Hide file tree
Showing 11 changed files with 574 additions and 58 deletions.
8 changes: 8 additions & 0 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -2221,6 +2221,14 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
}

/*
 * Run the sock_ops BPF program with the BPF_SOCK_OPS_RTT_CB operation,
 * but only for sockets on which BPF_SOCK_OPS_RTT_CB_FLAG is set.
 * The trailing arguments handed to tcp_call_bpf() are 0 and NULL, and
 * its return value is ignored.
 */
static inline void tcp_bpf_rtt(struct sock *sk)
{
	struct tcp_sock *tsk = tcp_sk(sk);

	/* RTT callback not enabled on this socket: nothing to do. */
	if (!(BPF_SOCK_OPS_TEST_FLAG(tsk, BPF_SOCK_OPS_RTT_CB_FLAG)))
		return;

	tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
}

#if IS_ENABLED(CONFIG_SMC)
extern struct static_key_false tcp_have_smc;
#endif
Expand Down
12 changes: 11 additions & 1 deletion include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -1770,6 +1770,7 @@ union bpf_attr {
* * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
* * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
* * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
* * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
*
* Therefore, this function can be used to clear a callback flag by
* setting the appropriate bit to zero. e.g. to disable the RTO
Expand Down Expand Up @@ -3072,6 +3073,12 @@ struct bpf_tcp_sock {
* sum(delta(snd_una)), or how many bytes
* were acked.
*/
__u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups
* total number of DSACK blocks received
*/
__u32 delivered; /* Total data packets delivered incl. rexmits */
__u32 delivered_ce; /* Like the above but only ECE marked packets */
__u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */
};

struct bpf_sock_tuple {
Expand Down Expand Up @@ -3314,7 +3321,8 @@ struct bpf_sock_ops {
#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0)
#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1)
#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2)
#define BPF_SOCK_OPS_ALL_CB_FLAGS 0x7 /* Mask of all currently
#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3)
#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently
* supported cb flags
*/

Expand Down Expand Up @@ -3369,6 +3377,8 @@ enum {
BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after
* socket transition to LISTEN state.
*/
BPF_SOCK_OPS_RTT_CB, /* Called on every RTT.
*/
};

/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
Expand Down
207 changes: 153 additions & 54 deletions net/core/filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -5194,54 +5194,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
};
#endif /* CONFIG_IPV6_SEG6_BPF */

/*
 * CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT)
 *
 * Expands to a switch on si->off, so a variable named "si" must be in
 * scope wherever the macro is used. For each field listed below, when
 * si->off equals offsetof(md_type, <field>), CONVERT(<field>) is invoked.
 *
 * md_type: struct type whose member offsets are matched against si->off.
 * CONVERT: single-argument macro invoked with the matching field name.
 *
 * There is no default label: an offset that matches none of the listed
 * fields leaves the switch without doing anything.
 */
#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \
do { \
switch (si->off) { \
case offsetof(md_type, snd_cwnd): \
CONVERT(snd_cwnd); break; \
case offsetof(md_type, srtt_us): \
CONVERT(srtt_us); break; \
case offsetof(md_type, snd_ssthresh): \
CONVERT(snd_ssthresh); break; \
case offsetof(md_type, rcv_nxt): \
CONVERT(rcv_nxt); break; \
case offsetof(md_type, snd_nxt): \
CONVERT(snd_nxt); break; \
case offsetof(md_type, snd_una): \
CONVERT(snd_una); break; \
case offsetof(md_type, mss_cache): \
CONVERT(mss_cache); break; \
case offsetof(md_type, ecn_flags): \
CONVERT(ecn_flags); break; \
case offsetof(md_type, rate_delivered): \
CONVERT(rate_delivered); break; \
case offsetof(md_type, rate_interval_us): \
CONVERT(rate_interval_us); break; \
case offsetof(md_type, packets_out): \
CONVERT(packets_out); break; \
case offsetof(md_type, retrans_out): \
CONVERT(retrans_out); break; \
case offsetof(md_type, total_retrans): \
CONVERT(total_retrans); break; \
case offsetof(md_type, segs_in): \
CONVERT(segs_in); break; \
case offsetof(md_type, data_segs_in): \
CONVERT(data_segs_in); break; \
case offsetof(md_type, segs_out): \
CONVERT(segs_out); break; \
case offsetof(md_type, data_segs_out): \
CONVERT(data_segs_out); break; \
case offsetof(md_type, lost_out): \
CONVERT(lost_out); break; \
case offsetof(md_type, sacked_out): \
CONVERT(sacked_out); break; \
case offsetof(md_type, bytes_received): \
CONVERT(bytes_received); break; \
case offsetof(md_type, bytes_acked): \
CONVERT(bytes_acked); break; \
} \
} while (0)

#ifdef CONFIG_INET
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
Expand Down Expand Up @@ -5592,7 +5544,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
icsk_retransmits))
return false;

if (off % size != 0)
Expand Down Expand Up @@ -5623,8 +5576,19 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
offsetof(struct tcp_sock, FIELD)); \
} while (0)

CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock,
BPF_TCP_SOCK_GET_COMMON);
/*
 * BPF_INET_SOCK_GET_COMMON(FIELD)
 *
 * Emit one load instruction at *insn (and advance insn) that reads FIELD
 * from struct inet_connection_sock: the load is sized by the field's size
 * in struct inet_connection_sock, uses si->src_reg as the base register
 * with the field's offset in struct inet_connection_sock, and places the
 * result in si->dst_reg. Expects "insn" and "si" to be in scope at the
 * point of use.
 *
 * The BUILD_BUG_ON breaks the build if FIELD is wider in
 * struct inet_connection_sock than the same-named member of
 * struct bpf_tcp_sock.
 */
#define BPF_INET_SOCK_GET_COMMON(FIELD) \
do { \
BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock, \
FIELD) > \
FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct inet_connection_sock, \
FIELD), \
si->dst_reg, si->src_reg, \
offsetof( \
struct inet_connection_sock, \
FIELD)); \
} while (0)

if (insn > insn_buf)
return insn - insn_buf;
Expand All @@ -5640,6 +5604,81 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
offsetof(struct tcp_sock, rtt_min) +
offsetof(struct minmax_sample, v));
break;
case offsetof(struct bpf_tcp_sock, snd_cwnd):
BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
break;
case offsetof(struct bpf_tcp_sock, srtt_us):
BPF_TCP_SOCK_GET_COMMON(srtt_us);
break;
case offsetof(struct bpf_tcp_sock, snd_ssthresh):
BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
break;
case offsetof(struct bpf_tcp_sock, rcv_nxt):
BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
break;
case offsetof(struct bpf_tcp_sock, snd_nxt):
BPF_TCP_SOCK_GET_COMMON(snd_nxt);
break;
case offsetof(struct bpf_tcp_sock, snd_una):
BPF_TCP_SOCK_GET_COMMON(snd_una);
break;
case offsetof(struct bpf_tcp_sock, mss_cache):
BPF_TCP_SOCK_GET_COMMON(mss_cache);
break;
case offsetof(struct bpf_tcp_sock, ecn_flags):
BPF_TCP_SOCK_GET_COMMON(ecn_flags);
break;
case offsetof(struct bpf_tcp_sock, rate_delivered):
BPF_TCP_SOCK_GET_COMMON(rate_delivered);
break;
case offsetof(struct bpf_tcp_sock, rate_interval_us):
BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
break;
case offsetof(struct bpf_tcp_sock, packets_out):
BPF_TCP_SOCK_GET_COMMON(packets_out);
break;
case offsetof(struct bpf_tcp_sock, retrans_out):
BPF_TCP_SOCK_GET_COMMON(retrans_out);
break;
case offsetof(struct bpf_tcp_sock, total_retrans):
BPF_TCP_SOCK_GET_COMMON(total_retrans);
break;
case offsetof(struct bpf_tcp_sock, segs_in):
BPF_TCP_SOCK_GET_COMMON(segs_in);
break;
case offsetof(struct bpf_tcp_sock, data_segs_in):
BPF_TCP_SOCK_GET_COMMON(data_segs_in);
break;
case offsetof(struct bpf_tcp_sock, segs_out):
BPF_TCP_SOCK_GET_COMMON(segs_out);
break;
case offsetof(struct bpf_tcp_sock, data_segs_out):
BPF_TCP_SOCK_GET_COMMON(data_segs_out);
break;
case offsetof(struct bpf_tcp_sock, lost_out):
BPF_TCP_SOCK_GET_COMMON(lost_out);
break;
case offsetof(struct bpf_tcp_sock, sacked_out):
BPF_TCP_SOCK_GET_COMMON(sacked_out);
break;
case offsetof(struct bpf_tcp_sock, bytes_received):
BPF_TCP_SOCK_GET_COMMON(bytes_received);
break;
case offsetof(struct bpf_tcp_sock, bytes_acked):
BPF_TCP_SOCK_GET_COMMON(bytes_acked);
break;
case offsetof(struct bpf_tcp_sock, dsack_dups):
BPF_TCP_SOCK_GET_COMMON(dsack_dups);
break;
case offsetof(struct bpf_tcp_sock, delivered):
BPF_TCP_SOCK_GET_COMMON(delivered);
break;
case offsetof(struct bpf_tcp_sock, delivered_ce):
BPF_TCP_SOCK_GET_COMMON(delivered_ce);
break;
case offsetof(struct bpf_tcp_sock, icsk_retransmits):
BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
break;
}

return insn - insn_buf;
Expand Down Expand Up @@ -7913,9 +7952,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
} while (0)

CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops,
SOCK_OPS_GET_TCP_SOCK_FIELD);

if (insn > insn_buf)
return insn - insn_buf;

Expand Down Expand Up @@ -8085,6 +8121,69 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
struct sock, type);
break;
case offsetof(struct bpf_sock_ops, snd_cwnd):
SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
break;
case offsetof(struct bpf_sock_ops, srtt_us):
SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
break;
case offsetof(struct bpf_sock_ops, snd_ssthresh):
SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
break;
case offsetof(struct bpf_sock_ops, rcv_nxt):
SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
break;
case offsetof(struct bpf_sock_ops, snd_nxt):
SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
break;
case offsetof(struct bpf_sock_ops, snd_una):
SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
break;
case offsetof(struct bpf_sock_ops, mss_cache):
SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
break;
case offsetof(struct bpf_sock_ops, ecn_flags):
SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
break;
case offsetof(struct bpf_sock_ops, rate_delivered):
SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
break;
case offsetof(struct bpf_sock_ops, rate_interval_us):
SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
break;
case offsetof(struct bpf_sock_ops, packets_out):
SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
break;
case offsetof(struct bpf_sock_ops, retrans_out):
SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
break;
case offsetof(struct bpf_sock_ops, total_retrans):
SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
break;
case offsetof(struct bpf_sock_ops, segs_in):
SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
break;
case offsetof(struct bpf_sock_ops, data_segs_in):
SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
break;
case offsetof(struct bpf_sock_ops, segs_out):
SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
break;
case offsetof(struct bpf_sock_ops, data_segs_out):
SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
break;
case offsetof(struct bpf_sock_ops, lost_out):
SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
break;
case offsetof(struct bpf_sock_ops, sacked_out):
SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
break;
case offsetof(struct bpf_sock_ops, bytes_received):
SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
break;
case offsetof(struct bpf_sock_ops, bytes_acked):
SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
break;
case offsetof(struct bpf_sock_ops, sk):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern,
Expand Down
4 changes: 4 additions & 0 deletions net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
tp->rtt_seq = tp->snd_nxt;
tp->mdev_max_us = tcp_rto_min_us(sk);

tcp_bpf_rtt(sk);
}
} else {
/* no previous measure. */
Expand All @@ -786,6 +788,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;

tcp_bpf_rtt(sk);
}
tp->srtt_us = max(1U, srtt);
}
Expand Down
1 change: 1 addition & 0 deletions samples/bpf/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ always += tcp_iw_kern.o
always += tcp_clamp_kern.o
always += tcp_basertt_kern.o
always += tcp_tos_reflect_kern.o
always += tcp_dumpstats_kern.o
always += xdp_redirect_kern.o
always += xdp_redirect_map_kern.o
always += xdp_redirect_cpu_kern.o
Expand Down
2 changes: 1 addition & 1 deletion samples/bpf/tcp_bpf.readme
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,4 @@ attached to the cgroupv2).

To remove (unattach) a socket_ops BPF program from a cgroupv2:

bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
Loading

0 comments on commit e5a3e25

Please sign in to comment.