Add eBPF connection tracking
Introduces connection tracking via eBPF. This allows Scope to get
notified of every connection event without relying on the parsing of
/proc/$pid/net/tcp{,6} and /proc/$pid/fd/*, and therefore improves
performance.

The eBPF program is in a Python script using bcc: docker/tcpv4tracer.py.
It is contributed upstream via iovisor/bcc#762.
It uses kprobes on the following kernel functions:
- tcp_v4_connect
- inet_csk_accept
- tcp_close

It generates "connect", "accept" and "close" events containing the
connection tuple as well as the pid and the netns.

The output of the Python script is piped into the Scope Probe, and
probe/endpoint/ebpf.go maintains the list of connections. As with
conntrack, we keep dead connections for one iteration in order to
report short-lived connections.
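
To make the probe side concrete, here is a minimal, hypothetical Go
sketch of what probe/endpoint/ebpf.go does (the real file is not shown
in this extract): it parses the tracer's text output and keeps closed
connections around for one more report, mirroring the conntrack
behaviour. The connection/tracker types and the tuple key format below
are made up for illustration.

package main

import (
    "bufio"
    "fmt"
    "os"
    "strings"
)

type connection struct {
    pid, netns, saddr, daddr, sport, dport string
}

type tracker struct {
    open   map[string]connection // currently open connections, keyed by 4-tuple
    closed []connection          // closed since the last report, kept for one iteration
}

// handleLine parses one "type pid saddr daddr sport dport netns" line
// as printed by tcpv4tracer.py.
func (t *tracker) handleLine(line string) {
    f := strings.Fields(line)
    if len(f) != 7 {
        return // not an event line
    }
    c := connection{pid: f[1], saddr: f[2], daddr: f[3], sport: f[4], dport: f[5], netns: f[6]}
    key := c.saddr + ":" + c.sport + "-" + c.daddr + ":" + c.dport
    switch f[0] {
    case "connect", "accept":
        t.open[key] = c
    case "close":
        delete(t.open, key)
        t.closed = append(t.closed, c) // keep it for one more report
    }
    // the header line ("TYPE PID ...") matches no case and is ignored
}

// walkConnections reports open plus recently closed connections, then drops
// the closed ones, so a short-lived connection appears in exactly one report.
func (t *tracker) walkConnections(f func(connection)) {
    for _, c := range t.open {
        f(c)
    }
    for _, c := range t.closed {
        f(c)
    }
    t.closed = nil
}

func main() {
    t := &tracker{open: map[string]connection{}}
    scanner := bufio.NewScanner(os.Stdin) // e.g.: python tcpv4tracer.py | this-sketch
    for scanner.Scan() {
        t.handleLine(scanner.Text())
    }
    t.walkConnections(func(c connection) { fmt.Println(c) })
}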

The code for parsing /proc/$pid/net/tcp{,6} and /proc/$pid/fd/* is still
there and still used at start-up, because eBPF only brings us the events
and not the initial state. However, the /proc parsing for the initial
state is now done in the foreground instead of the background, via
newForegroundReader().
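
Purely illustrative sketch of the foreground-versus-background
distinction, assuming a made-up walkProcConnections helper; the real
newForegroundReader() and reader types are not part of this diff.

package main

import (
    "fmt"
    "sync"
    "time"
)

// walkProcConnections is a made-up stand-in for walking
// /proc/$pid/net/tcp{,6} and /proc/$pid/fd/*.
func walkProcConnections() []string {
    return []string{"10.0.0.1:4040-10.0.0.2:51234"}
}

// backgroundReader refreshes a cached snapshot on a ticker; early readers
// can see an empty or stale result.
type backgroundReader struct {
    mu     sync.Mutex
    cached []string
}

func newBackgroundReader(interval time.Duration) *backgroundReader {
    r := &backgroundReader{}
    go func() {
        for range time.Tick(interval) {
            conns := walkProcConnections()
            r.mu.Lock()
            r.cached = conns
            r.mu.Unlock()
        }
    }()
    return r
}

func (r *backgroundReader) Connections() []string {
    r.mu.Lock()
    defer r.mu.Unlock()
    return r.cached
}

// foregroundReader walks /proc on every call, so the start-up snapshot is
// complete before eBPF events are applied on top of it.
type foregroundReader struct{}

func (foregroundReader) Connections() []string { return walkProcConnections() }

func main() {
    fmt.Println(newBackgroundReader(time.Second).Connections()) // likely empty right after start-up
    fmt.Println(foregroundReader{}.Connections())               // complete snapshot, computed now
}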

Scope Probe also falls back on the old /proc parsing if eBPF is not
working (e.g. the kernel is too old, or kernel headers are missing).
There is also a flag "probe.ebpf.connections" that disables eBPF when
set to false.

NAT resolution on connections from eBPF works in the same way as it did
on connections from /proc: by using conntrack.

The Scope Docker image is bigger because we need a few more packages
for bcc:
- weaveworks/scope in current master:  22 MB
- weaveworks/scope with this patch:   147 MB

Limitations:
- [ ] Does not support IPv6
- [ ] Sets `procspied: true` on connections coming from eBPF
- [ ] Size of the Docker images: 6 times bigger
- [ ] Requirement on kernel headers
- [ ] Location of kernel headers: iovisor/bcc#743

Fixes weaveworks#1168 (walking /proc to
obtain connections is very expensive)

Fixes weaveworks#1260 (Short-lived
connections not tracked for containers in shared networking namespaces)
Lorenzo Manacorda authored and alban committed Oct 31, 2016
1 parent 654ba35 commit 5962012
Showing 12 changed files with 702 additions and 33 deletions.
8 changes: 3 additions & 5 deletions docker/Dockerfile
@@ -1,14 +1,12 @@
-FROM alpine:3.3
+FROM zlim/bcc
 MAINTAINER Weaveworks Inc <help@weave.works>
 LABEL works.weave.role=system
 WORKDIR /home/weave
-RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" >>/etc/apk/repositories && \
-apk add --update bash runit conntrack-tools iproute2 util-linux curl && \
-rm -rf /var/cache/apk/*
+RUN apt-get update -y && apt-get install -y bash runit conntrack iproute2 util-linux curl python bcc-tools python-bcc libbcc
 ADD ./docker.tgz /
 ADD ./demo.json /
 ADD ./weave /usr/bin/
-COPY ./scope ./runsvinit ./entrypoint.sh /home/weave/
+COPY ./scope ./runsvinit ./entrypoint.sh ./tcpv4tracer.py /home/weave/
 COPY ./run-app /etc/service/app/run
 COPY ./run-probe /etc/service/probe/run
 EXPOSE 4040
291 changes: 291 additions & 0 deletions docker/tcpv4tracer.py
@@ -0,0 +1,291 @@
#!/usr/bin/python
#
# tcpv4tracer Trace TCP IPv4 connections.
# For Linux, uses BCC, eBPF. Embedded C.
#
# USAGE: tcpv4tracer [-h] [-p PID]
#
from __future__ import print_function
from bcc import BPF

import argparse
import ctypes

parser = argparse.ArgumentParser(
    description="Trace TCP IPv4 connections",
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("-p", "--pid",
    help="trace this PID only")
args = parser.parse_args()

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <net/inet_sock.h>
#include <net/net_namespace.h>
#include <bcc/proto.h>
#define TCP_EVENT_TYPE_CONNECT 1
#define TCP_EVENT_TYPE_ACCEPT 2
#define TCP_EVENT_TYPE_CLOSE 3
struct tcp_event_t {
u32 type;
u32 netns;
u32 pid;
u32 saddr;
u32 daddr;
u16 sport;
u16 dport;
};
BPF_PERF_OUTPUT(tcp_event);
BPF_HASH(connectsock, u64, struct sock *);
BPF_HASH(closesock, u64, struct sock *);
int kprobe__tcp_v4_connect(struct pt_regs *ctx, struct sock *sk)
{
u64 pid = bpf_get_current_pid_tgid();
##FILTER_PID##
// stash the sock ptr for lookup on return
connectsock.update(&pid, &sk);
return 0;
};
int kretprobe__tcp_v4_connect(struct pt_regs *ctx)
{
int ret = PT_REGS_RC(ctx);
u64 pid = bpf_get_current_pid_tgid();
struct sock **skpp;
skpp = connectsock.lookup(&pid);
if (skpp == 0) {
return 0; // missed entry
}
if (ret != 0) {
// failed to send SYN packet, may not have populated
// socket __sk_common.{skc_rcv_saddr, ...}
connectsock.delete(&pid);
return 0;
}
// pull in details
struct sock *skp = *skpp;
struct ns_common *ns;
u32 saddr = 0, daddr = 0;
u16 sport = 0, dport = 0;
u32 net_ns_inum = 0;
bpf_probe_read(&sport, sizeof(sport), &((struct inet_sock *)skp)->inet_sport);
bpf_probe_read(&saddr, sizeof(saddr), &skp->__sk_common.skc_rcv_saddr);
bpf_probe_read(&daddr, sizeof(daddr), &skp->__sk_common.skc_daddr);
bpf_probe_read(&dport, sizeof(dport), &skp->__sk_common.skc_dport);
// Get network namespace id, if kernel supports it
#ifdef CONFIG_NET_NS
possible_net_t skc_net = {0,};
bpf_probe_read(&skc_net, sizeof(skc_net), &skp->__sk_common.skc_net);
bpf_probe_read(&net_ns_inum, sizeof(net_ns_inum), &skc_net.net->ns.inum);
#else
net_ns_inum = 0;
#endif
// output
struct tcp_event_t evt = {
.type = TCP_EVENT_TYPE_CONNECT,
.pid = pid >> 32,
.saddr = saddr,
.daddr = daddr,
.sport = ntohs(sport),
.dport = ntohs(dport),
.netns = net_ns_inum,
};
u16 family = 0;
bpf_probe_read(&family, sizeof(family), &skp->__sk_common.skc_family);
tcp_event.perf_submit(ctx, &evt, sizeof(evt));
connectsock.delete(&pid);
return 0;
}
int kprobe__tcp_close(struct pt_regs *ctx, struct sock *sk)
{
u64 pid = bpf_get_current_pid_tgid();
##FILTER_PID##
// stash the sock ptr for lookup on return
closesock.update(&pid, &sk);
return 0;
};
int kretprobe__tcp_close(struct pt_regs *ctx)
{
u64 pid = bpf_get_current_pid_tgid();
struct sock **skpp;
skpp = closesock.lookup(&pid);
if (skpp == 0) {
return 0; // missed entry
}
// pull in details
struct sock *skp = *skpp;
u32 saddr = 0, daddr = 0;
u16 sport = 0, dport = 0;
u32 net_ns_inum = 0;
bpf_probe_read(&saddr, sizeof(saddr), &skp->__sk_common.skc_rcv_saddr);
bpf_probe_read(&daddr, sizeof(daddr), &skp->__sk_common.skc_daddr);
bpf_probe_read(&sport, sizeof(sport), &((struct inet_sock *)skp)->inet_sport);
bpf_probe_read(&dport, sizeof(dport), &skp->__sk_common.skc_dport);
// Get network namespace id, if kernel supports it
#ifdef CONFIG_NET_NS
possible_net_t skc_net = {0,};
bpf_probe_read(&skc_net, sizeof(skc_net), &skp->__sk_common.skc_net);
bpf_probe_read(&net_ns_inum, sizeof(net_ns_inum), &skc_net.net->ns.inum);
#else
net_ns_inum = 0;
#endif
// output
struct tcp_event_t evt = {
.type = TCP_EVENT_TYPE_CLOSE,
.pid = pid >> 32,
.saddr = saddr,
.daddr = daddr,
.sport = ntohs(sport),
.dport = ntohs(dport),
.netns = net_ns_inum,
};
u16 family = 0;
bpf_probe_read(&family, sizeof(family), &skp->__sk_common.skc_family);
// do not send event if IP address is 0.0.0.0 or port is 0
if (evt.saddr != 0 && evt.daddr != 0 && evt.sport != 0 && evt.dport != 0) {
tcp_event.perf_submit(ctx, &evt, sizeof(evt));
}
closesock.delete(&pid);
return 0;
}
int kretprobe__inet_csk_accept(struct pt_regs *ctx)
{
struct sock *newsk = (struct sock *)PT_REGS_RC(ctx);
u64 pid = bpf_get_current_pid_tgid();
##FILTER_PID##
if (newsk == NULL)
return 0;
// check this is TCP
u8 protocol = 0;
// workaround for reading the sk_protocol bitfield:
bpf_probe_read(&protocol, 1, (void *)((long)&newsk->sk_wmem_queued) - 3);
if (protocol != IPPROTO_TCP)
return 0;
// pull in details
u16 family = 0, lport = 0, dport = 0;
u32 net_ns_inum = 0;
bpf_probe_read(&family, sizeof(family), &newsk->__sk_common.skc_family);
bpf_probe_read(&lport, sizeof(lport), &newsk->__sk_common.skc_num);
bpf_probe_read(&dport, sizeof(dport), &newsk->__sk_common.skc_dport);
// Get network namespace id, if kernel supports it
#ifdef CONFIG_NET_NS
possible_net_t skc_net = {0,};
bpf_probe_read(&skc_net, sizeof(skc_net), &newsk->__sk_common.skc_net);
bpf_probe_read(&net_ns_inum, sizeof(net_ns_inum), &skc_net.net->ns.inum);
#else
net_ns_inum = 0;
#endif
if (family == AF_INET) {
struct tcp_event_t evt = {
.type = TCP_EVENT_TYPE_ACCEPT,
.pid = pid >> 32,
.netns = net_ns_inum,
};
bpf_probe_read(&evt.saddr, sizeof(u32),
&newsk->__sk_common.skc_rcv_saddr);
bpf_probe_read(&evt.daddr, sizeof(u32),
&newsk->__sk_common.skc_daddr);
evt.sport = lport;
evt.dport = ntohs(dport);
tcp_event.perf_submit(ctx, &evt, sizeof(evt));
}
// else drop
return 0;
}
"""

class TCPEvt(ctypes.Structure):
    _fields_ = [
        ("type", ctypes.c_uint),
        ("netns", ctypes.c_uint),
        ("pid", ctypes.c_uint),
        ("saddr", ctypes.c_uint),
        ("daddr", ctypes.c_uint),
        ("sport", ctypes.c_ushort),
        ("dport", ctypes.c_ushort),
    ]

def print_event(cpu, data, size):
    event = ctypes.cast(data, ctypes.POINTER(TCPEvt)).contents
    if event.type == 1:
        type_str = "connect"
    elif event.type == 2:
        type_str = "accept"
    elif event.type == 3:
        type_str = "close"
    else:
        type_str = "unknown-" + str(event.type)

    print("%s %s %s %s %s %s %s" % (type_str, event.pid,
                                    inet_ntoa(event.saddr),
                                    inet_ntoa(event.daddr),
                                    event.sport,
                                    event.dport,
                                    event.netns,
                                    ))

if args.pid:
    bpf_text = bpf_text.replace('##FILTER_PID##',
        'if (pid != %s) { return 0; }' % args.pid)
else:
    bpf_text = bpf_text.replace('##FILTER_PID##', '')

# initialize BPF
b = BPF(text=bpf_text)

# header
print("TYPE PID SADDR DADDR SPORT DPORT NETNS")

def inet_ntoa(addr):
    dq = ''
    for i in range(0, 4):
        dq = dq + str(addr & 0xff)
        if (i != 3):
            dq = dq + '.'
        addr = addr >> 8
    return dq

b["tcp_event"].open_perf_buffer(print_event)
while True:
    b.kprobe_poll()