diff --git a/cmd/nerdctl/run.go b/cmd/nerdctl/run.go index 0815713b2ea..32471114f76 100644 --- a/cmd/nerdctl/run.go +++ b/cmd/nerdctl/run.go @@ -835,6 +835,15 @@ func getContainerStateDirPath(cmd *cobra.Command, dataStore, id string) (string, } func withContainerLabels(cmd *cobra.Command) ([]containerd.NewContainerOpts, error) { + labelMap, err := readKVStringsMapfFromLabel(cmd) + if err != nil { + return nil, err + } + o := containerd.WithAdditionalContainerLabels(labelMap) + return []containerd.NewContainerOpts{o}, nil +} + +func readKVStringsMapfFromLabel(cmd *cobra.Command) (map[string]string, error) { labelsMap, err := cmd.Flags().GetStringArray("label") if err != nil { return nil, err @@ -849,8 +858,8 @@ func withContainerLabels(cmd *cobra.Command) ([]containerd.NewContainerOpts, err if err != nil { return nil, err } - o := containerd.WithAdditionalContainerLabels(strutil.ConvertKVStringsToMap(labels)) - return []containerd.NewContainerOpts{o}, nil + + return strutil.ConvertKVStringsToMap(labels), nil } func withInternalLabels(ns, name, hostname, containerStateDir string, extraHosts, networks []string, ports []gocni.PortMapping, logURI string, anonVolumes []string, pidFile, platform string) (containerd.NewContainerOpts, error) { diff --git a/cmd/nerdctl/run_linux.go b/cmd/nerdctl/run_linux.go index 232682b7ef9..c2f7e9c8743 100644 --- a/cmd/nerdctl/run_linux.go +++ b/cmd/nerdctl/run_linux.go @@ -21,6 +21,7 @@ import ( "fmt" "strings" + "github.com/containerd/nerdctl/pkg/bypass4netnsutil" "github.com/containerd/nerdctl/pkg/rootlessutil" "github.com/containerd/nerdctl/pkg/strutil" "github.com/docker/go-units" @@ -82,6 +83,17 @@ func setPlatformOptions(opts []oci.SpecOpts, cmd *cobra.Command, id string) ([]o opts = append(opts, secOpts...) 
} + labelsMap, err := readKVStringsMapfFromLabel(cmd) + if err != nil { + return nil, err + } + b4nnOpts, err := bypass4netnsutil.GenerateBypass4netnsOpts(securityOptsMaps, labelsMap, id) + if err != nil { + return nil, err + } else { + opts = append(opts, b4nnOpts...) + } + capAdd, err := cmd.Flags().GetStringSlice("cap-add") if err != nil { return nil, err diff --git a/docs/experimental.md b/docs/experimental.md index 4b42dede8d4..d5750b19c20 100644 --- a/docs/experimental.md +++ b/docs/experimental.md @@ -8,3 +8,4 @@ The following features are experimental and subject to change: eStargz itself is out of experimental. - [Image Distribution on IPFS](./ipfs.md) - [Image Sign and Verify (cosign)](./cosign.md) +- [Rootless container networking acceleration with bypass4netns](./rootless.md#bypass4netns) \ No newline at end of file diff --git a/docs/rootless.md b/docs/rootless.md index 3311969d389..5e4cc6b6b47 100644 --- a/docs/rootless.md +++ b/docs/rootless.md @@ -104,6 +104,34 @@ $ nerdctl run -it --rm ghcr.io/stargz-containers/alpine:3.10.2-esgz See https://github.com/containerd/stargz-snapshotter/blob/master/docs/pre-converted-images.md for the image list. +## bypass4netns +[bypass4netns(https://github.com/rootless-containers/bypass4netns)](https://github.com/rootless-containers/bypass4netns) is an accelerator for rootless networking. + +This improves **outgoing or incoming (with --publish option) networking performance.** + +The performance benchmark with iperf3 on Ubuntu 21.10 on Hyper-V VM is shown below. 
+| iperf3 benchmark | without bypass4netns | with bypass4netns | +| ----------------- | -------------------- | ----------------- | +| container -> host | 0.398 Gbps | **42.2 Gbps** | +| host -> container | 20.6 Gbps | **47.4 Gbps** | + +This benchmark can be reproduced with [https://github.com/rootless-containers/bypass4netns/blob/f009d96139e9e38ce69a2ea8a9a746349bad273c/Vagrantfile](https://github.com/rootless-containers/bypass4netns/blob/f009d96139e9e38ce69a2ea8a9a746349bad273c/Vagrantfile). + +Acceleration with bypass4netns is available with `--label nerdctl/bypass4netns=true`. +Example: +```console +$ nerdctl run -it --rm -p 8080:80 --label nerdctl/bypass4netns=true alpine +``` + +More details are available at [https://github.com/rootless-containers/bypass4netns/blob/master/README.md](https://github.com/rootless-containers/bypass4netns/blob/master/README.md). + +### :warning: Caveats :warning: +The subnets that bypass4netns does not handle (`127.0.0.0/8`, `10.0.0.0/8`) are hard-coded. +Container networks that are not contained in these subnets may break or cause other problems.
+ +### TODO +- Remove hard-coded subnets in pkg/bypass4netnsutil/bypass.go + ## Troubleshooting ### Hint to Fedora users diff --git a/go.mod b/go.mod index 900dbd6dfaf..0d850b639b1 100644 --- a/go.mod +++ b/go.mod @@ -37,6 +37,7 @@ require ( github.com/opencontainers/image-spec v1.0.3-0.20211215212317-ea0209f50ae1 github.com/opencontainers/runtime-spec v1.0.3-0.20211214071223-8958f93039ab github.com/pelletier/go-toml v1.9.4 + github.com/rootless-containers/bypass4netns v0.1.1-0.20220214050838-562b4845f47e github.com/rootless-containers/rootlesskit v0.14.6 github.com/sirupsen/logrus v1.8.1 github.com/spf13/cobra v1.3.0 diff --git a/go.sum b/go.sum index fc04379ebbe..fe38c20c699 100644 --- a/go.sum +++ b/go.sum @@ -1563,6 +1563,7 @@ github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFSt github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= github.com/openzipkin/zipkin-go v0.1.1/go.mod h1:NtoC/o8u3JlF1lSlyPNswIbeQH9bJTmOf0Erfk+hxe8= +github.com/oraoto/go-pidfd v0.1.2-0.20210402155345-46bf1ba22e22/go.mod h1:gPWelSU60MvzRX+ToMlKj9lZRkeqybg6qy8cy4+rZWE= github.com/otiai10/copy v1.2.0/go.mod h1:rrF5dJ5F0t/EWSYODDu4j9/vEeYHMkc8jt0zJChqQWw= github.com/otiai10/curr v0.0.0-20150429015615-9b4961190c95/go.mod h1:9qAhocn7zKJG+0mI8eUu6xqkFDYS2kb2saOteoSB3cE= github.com/otiai10/curr v1.0.0/go.mod h1:LskTG5wDwr8Rs+nNQ+1LlxRjAtTZZjtJW4rMXl6j4vs= @@ -1654,6 +1655,8 @@ github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6So github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.6.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc= +github.com/rootless-containers/bypass4netns v0.1.1-0.20220214050838-562b4845f47e 
h1:z3xZoOlU1OvHXwktDJiK69nxhniKV7lyO3+agoipzTY= +github.com/rootless-containers/bypass4netns v0.1.1-0.20220214050838-562b4845f47e/go.mod h1:YOVRdnzEO3JHcxvobbHdUowG+98z0SIgBFQ6KqAQo9U= github.com/rootless-containers/rootlesskit v0.14.6 h1:5kvJK6eeUtWZz1mYegu5S7DHOahq93K+jbc/mz+hbFQ= github.com/rootless-containers/rootlesskit v0.14.6/go.mod h1:uHPTRoPO6ZdOl2q99ZKOK14PJAwepfNKh6hV57AOZYQ= github.com/rs/cors v1.7.0 h1:+88SsELBHx5r+hZ8TCkggzSstaWNbDvThkVK8H6f9ik= @@ -1679,6 +1682,7 @@ github.com/sclevine/spec v1.2.0/go.mod h1:W4J29eT/Kzv7/b9IWLB055Z+qvVC9vt0Arko24 github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= +github.com/seccomp/libseccomp-golang v0.9.2-0.20220128023657-2a7184820543/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/securego/gosec/v2 v2.9.1/go.mod h1:oDcDLcatOJxkCGaCaq8lua1jTnYf6Sou4wdiJ1n4iHc= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= @@ -1851,6 +1855,7 @@ github.com/vishvananda/netns v0.0.0-20200728191858-db3c7e526aae/go.mod h1:DD4vA1 github.com/vishvananda/netns v0.0.0-20210104183010-2eb08e3e575f/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0= github.com/vishvananda/netns v0.0.0-20211101163701-50045581ed74 h1:gga7acRE695APm9hlsSMoOoE65U4/TcqNj90mc69Rlg= github.com/vishvananda/netns v0.0.0-20211101163701-50045581ed74/go.mod h1:DD4vA1DwXk04H54A1oHXtwZmA0grkVMdPxx/VGLCah0= +github.com/vtolstov/go-ioctl v0.0.0-20151206205506-6be9cced4810/go.mod h1:dF0BBJ2YrV1+2eAIyEI+KeSidgA6HqoIP1u5XTlMq/o= github.com/warpfork/go-wish v0.0.0-20180510122957-5ad1f5abf436/go.mod h1:x6AKhvSSexNrVSrViXSHUEbICjmGXhtgABaHIySUSGw= 
github.com/warpfork/go-wish v0.0.0-20190328234359-8b3e70f8e830/go.mod h1:x6AKhvSSexNrVSrViXSHUEbICjmGXhtgABaHIySUSGw= github.com/warpfork/go-wish v0.0.0-20200122115046-b9ea61034e4a h1:G++j5e0OC488te356JvdhaM8YS6nMsjLAYF7JxCv07w= diff --git a/pkg/bypass4netnsutil/bypass.go b/pkg/bypass4netnsutil/bypass.go new file mode 100644 index 00000000000..e99d9e397d5 --- /dev/null +++ b/pkg/bypass4netnsutil/bypass.go @@ -0,0 +1,87 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package bypass4netnsutil + +import ( + "context" + "path/filepath" + + "github.com/containerd/containerd/errdefs" + gocni "github.com/containerd/go-cni" + b4nnapi "github.com/rootless-containers/bypass4netns/pkg/api" + "github.com/rootless-containers/bypass4netns/pkg/api/daemon/client" +) + +func NewBypass4netnsCNIBypassManager(client client.Client) (*Bypass4netnsCNIBypassManager, error) { + if client == nil { + return nil, errdefs.ErrInvalidArgument + } + pm := &Bypass4netnsCNIBypassManager{ + Client: client, + } + return pm, nil +} + +type Bypass4netnsCNIBypassManager struct { + client.Client +} + +func (b4nnm *Bypass4netnsCNIBypassManager) StartBypass(ctx context.Context, ports []gocni.PortMapping, id, stateDir string) error { + socketPath, err := GetSocketPathByID(id) + if err != nil { + return err + } + pidFilePath, err := GetPidFilePathByID(id) + if err != nil { + return err + } + logFilePath := filepath.Join(stateDir, "bypass4netns.log") + + spec := b4nnapi.BypassSpec{ + ID: id, + SocketPath: socketPath, + PidFilePath: pidFilePath, + LogFilePath: logFilePath, + // TODO: Remove hard-coded subnets + IgnoreSubnets: []string{"127.0.0.0/8", "10.0.0.0/8"}, + } + portMap := []b4nnapi.PortSpec{} + for _, p := range ports { + portMap = append(portMap, b4nnapi.PortSpec{ + ParentIP: p.HostIP, + ParentPort: int(p.HostPort), + ChildPort: int(p.ContainerPort), + Protos: []string{p.Protocol}, + }) + } + spec.PortMapping = portMap + _, err = b4nnm.BypassManager().StartBypass(ctx, spec) + if err != nil { + return err + } + + return nil +} + +func (b4nnm *Bypass4netnsCNIBypassManager) StopBypass(ctx context.Context, id string) error { + err := b4nnm.BypassManager().StopBypass(ctx, id) + if err != nil { + return err + } + + return nil +} diff --git a/pkg/bypass4netnsutil/bypass4netnsutil.go b/pkg/bypass4netnsutil/bypass4netnsutil.go new file mode 100644 index 00000000000..57d92a95e01 --- /dev/null +++ b/pkg/bypass4netnsutil/bypass4netnsutil.go @@ -0,0 +1,143 @@ +/* + 
Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package bypass4netnsutil + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strconv" + + "github.com/containerd/containerd/containers" + "github.com/containerd/containerd/oci" + "github.com/containerd/nerdctl/pkg/labels" + "github.com/opencontainers/runtime-spec/specs-go" + b4nnoci "github.com/rootless-containers/bypass4netns/pkg/oci" +) + +func generateSecurityOpt(listenerPath string) (oci.SpecOpts, error) { + opt := func(_ context.Context, _ oci.Client, _ *containers.Container, s *specs.Spec) error { + s.Linux.Seccomp = b4nnoci.GetDefaultSeccompProfile(listenerPath) + return nil + } + return opt, nil +} + +func GenerateBypass4netnsOpts(securityOptsMaps map[string]string, labelMaps map[string]string, id string) ([]oci.SpecOpts, error) { + b4nn, ok := labelMaps[labels.Bypass4netns] + if !ok { + return nil, nil + } + + b4nnEnable, err := strconv.ParseBool(b4nn) + if err != nil { + return nil, err + } + + if !b4nnEnable { + return nil, nil + } + + if _, ok := securityOptsMaps["seccomp"]; ok { + return nil, fmt.Errorf("--security-opt seccomp cannot be specified if bypass4netns enabled") + } + + socketPath, err := GetSocketPathByID(id) + if err != nil { + return nil, err + } + + err = CreateSocketDir() + if err != nil { + return nil, err + } + + opts := []oci.SpecOpts{} + opt, err := generateSecurityOpt(socketPath) + if err != nil { + return nil, err + } + opts = append(opts, opt) + + 
return opts, nil +} + +func getXDGRuntimeDir() (string, error) { + if xrd := os.Getenv("XDG_RUNTIME_DIR"); xrd != "" { + return xrd, nil + } + return "", fmt.Errorf("environment variable XDG_RUNTIME_DIR is not set") +} + +func CreateSocketDir() error { + xdgRuntimeDir, err := getXDGRuntimeDir() + if err != nil { + return err + } + dirPath := filepath.Join(xdgRuntimeDir, "bypass4netns") + if _, err := os.Stat(dirPath); os.IsNotExist(err) { + err = os.MkdirAll(dirPath, 0775) + if err != nil { + return err + } + } + + return nil +} + +func GetBypass4NetnsdDefaultSocketPath() (string, error) { + xdgRuntimeDir, err := getXDGRuntimeDir() + if err != nil { + return "", err + } + + return filepath.Join(xdgRuntimeDir, "bypass4netnsd.sock"), nil +} + +func GetSocketPathByID(id string) (string, error) { + xdgRuntimeDir, err := getXDGRuntimeDir() + if err != nil { + return "", err + } + + socketPath := filepath.Join(xdgRuntimeDir, "bypass4netns", id[0:15]+".sock") + return socketPath, nil +} + +func GetPidFilePathByID(id string) (string, error) { + xdgRuntimeDir, err := getXDGRuntimeDir() + if err != nil { + return "", err + } + + socketPath := filepath.Join(xdgRuntimeDir, "bypass4netns", id[0:15]+".pid") + return socketPath, nil +} + +func IsBypass4netnsEnabled(annotations map[string]string) (bool, error) { + if b4nn, ok := annotations[labels.Bypass4netns]; ok { + b4nnEnable, err := strconv.ParseBool(b4nn) + if err != nil { + return false, err + } + + return b4nnEnable, nil + } + + return false, nil +} diff --git a/pkg/labels/labels.go b/pkg/labels/labels.go index 5a0b2d0cd2d..b43063b6d2b 100644 --- a/pkg/labels/labels.go +++ b/pkg/labels/labels.go @@ -69,4 +69,9 @@ const ( // Platform is the normalized platform string like "linux/ppc64le". Platform = Prefix + "platform" + + // Bypass4netns is the flag for acceleration with bypass4netns + // Boolean value which can be parsed with strconv.ParseBool() is required. 
+ // (like "nerdctl/bypass4netns=true" or "nerdctl/bypass4netns=false") + Bypass4netns = Prefix + "bypass4netns" ) diff --git a/pkg/ocihook/ocihook.go b/pkg/ocihook/ocihook.go index 5477df34b64..a64b78ba316 100644 --- a/pkg/ocihook/ocihook.go +++ b/pkg/ocihook/ocihook.go @@ -29,6 +29,7 @@ import ( "github.com/containerd/containerd/cmd/ctr/commands" gocni "github.com/containerd/go-cni" + "github.com/containerd/nerdctl/pkg/bypass4netnsutil" "github.com/containerd/nerdctl/pkg/dnsutil/hostsstore" "github.com/containerd/nerdctl/pkg/labels" "github.com/containerd/nerdctl/pkg/netutil" @@ -38,6 +39,7 @@ import ( dopts "github.com/docker/cli/opts" "github.com/opencontainers/runtime-spec/specs-go" + b4nndclient "github.com/rootless-containers/bypass4netns/pkg/api/daemon/client" rlkclient "github.com/rootless-containers/rootlesskit/pkg/api/client" "github.com/sirupsen/logrus" ) @@ -191,11 +193,26 @@ func newHandlerOpts(state *specs.State, dataStore, cniPath, cniNetconfPath strin return nil, err } } + if rootlessutil.IsRootlessChild() { o.rootlessKitClient, err = rootlessutil.NewRootlessKitClient() if err != nil { return nil, err } + b4nnEnabled, err := bypass4netnsutil.IsBypass4netnsEnabled(o.state.Annotations) + if err != nil { + return nil, err + } + if b4nnEnabled { + socketPath, err := bypass4netnsutil.GetBypass4NetnsdDefaultSocketPath() + if err != nil { + return nil, err + } + o.bypassClient, err = b4nndclient.New(socketPath) + if err != nil { + return nil, err + } + } } return o, nil } @@ -209,6 +226,7 @@ type handlerOpts struct { cniNames []string fullID string rootlessKitClient rlkclient.Client + bypassClient b4nndclient.Client extraHosts map[string]string // ip:host } @@ -324,18 +342,35 @@ func onCreateRuntime(opts *handlerOpts) error { hsMeta.Networks[cniName] = cniResRaw[i] } + b4nnEnabled, err := bypass4netnsutil.IsBypass4netnsEnabled(opts.state.Annotations) + if err != nil { + return err + } + if err := hs.Acquire(hsMeta); err != nil { return err } - if 
len(opts.ports) > 0 && rootlessutil.IsRootlessChild() { - pm, err := rootlessutil.NewRootlessCNIPortManager(opts.rootlessKitClient) - if err != nil { - return err - } - for _, p := range opts.ports { - if err := pm.ExposePort(ctx, p); err != nil { + + if rootlessutil.IsRootlessChild() { + if b4nnEnabled { + bm, err := bypass4netnsutil.NewBypass4netnsCNIBypassManager(opts.bypassClient) + if err != nil { return err } + err = bm.StartBypass(ctx, opts.ports, opts.state.ID, opts.state.Annotations[labels.StateDir]) + if err != nil { + return err + } + } else if len(opts.ports) > 0 { + pm, err := rootlessutil.NewRootlessCNIPortManager(opts.rootlessKitClient) + if err != nil { + return err + } + for _, p := range opts.ports { + if err := pm.ExposePort(ctx, p); err != nil { + return err + } + } } } } @@ -345,15 +380,31 @@ func onCreateRuntime(opts *handlerOpts) error { func onPostStop(opts *handlerOpts) error { ctx := context.Background() if opts.cni != nil { - if len(opts.ports) > 0 && rootlessutil.IsRootlessChild() { - pm, err := rootlessutil.NewRootlessCNIPortManager(opts.rootlessKitClient) - if err != nil { - return err - } - for _, p := range opts.ports { - if err := pm.UnexposePort(ctx, p); err != nil { + var err error + b4nnEnabled, err := bypass4netnsutil.IsBypass4netnsEnabled(opts.state.Annotations) + if err != nil { + return err + } + if rootlessutil.IsRootlessChild() { + if b4nnEnabled { + bm, err := bypass4netnsutil.NewBypass4netnsCNIBypassManager(opts.bypassClient) + if err != nil { + return err + } + err = bm.StopBypass(ctx, opts.state.ID) + if err != nil { return err } + } else if len(opts.ports) > 0 { + pm, err := rootlessutil.NewRootlessCNIPortManager(opts.rootlessKitClient) + if err != nil { + return err + } + for _, p := range opts.ports { + if err := pm.UnexposePort(ctx, p); err != nil { + return err + } + } } } portMapOpts, err := getPortMapOpts(opts)