setns then cloneflags when joining user namespace

For now, when a custom user namespace is required, we join all existing namespaces first before creating new ones. Idea: this can probably be generalized so that we perform all setns calls first and clone new namespaces afterwards, but that probably also requires setting uid/gid maps in C. Signed-off-by: Daniel, Dao Quang Minh <dqminh89@gmail.com>
opencontainers · Jul 13, 2015 · 652a368 · 652a368
1 parent d07e5fc
commit 652a368
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 26 deletions.
diff --git a/libcontainer/configs/namespaces_unix.go b/libcontainer/configs/namespaces_unix.go
@@ -118,3 +118,11 @@ func (n *Namespaces) index(t NamespaceType) int {
 func (n *Namespaces) Contains(t NamespaceType) bool {
 	return n.index(t) != -1
 }
+
+func (n *Namespaces) PathOf(t NamespaceType) string {
+	i := n.index(t)
+	if i == -1 {
+		return ""
+	}
+	return (*n)[i].Path
+}
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
@@ -157,6 +157,10 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
 }
 
 func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
+	// set init process environment
+	env := []string{"_LIBCONTAINER_INITTYPE=standard"}
+	var doClone bool
+
 	cloneFlags := c.config.Namespaces.CloneFlags()
 	if cloneFlags&syscall.CLONE_NEWUSER != 0 {
 		if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
@@ -168,17 +172,23 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
 			cmd.SysProcAttr.Credential = &syscall.Credential{}
 		}
 	}
-	cmd.SysProcAttr.Cloneflags = cloneFlags
+	// if we are not joining an existing user namespace, delegate to the golang
+	// implementation so that uid/gid mappings can be set in a standard way;
+	// otherwise pass the clone flags to nsexec and let it do the clone, because
+	// we don't have to perform any additional setup when starting a new process.
+	if c.config.Namespaces.PathOf(configs.NEWUSER) == "" {
+		cmd.SysProcAttr.Cloneflags = cloneFlags
+	} else {
+		// let nsexec clone namespaces instead of go
+		doClone = true
+		cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_CLONEFLAGS=%d",
+			cloneFlags))
+	}
 
-	// set init process environment
-	env := []string{"_LIBCONTAINER_INITTYPE=standard"}
-	var joinNamespaces configs.Namespaces
-	var doClone bool
 	nsMaps := make(map[configs.NamespaceType]string)
 	for _, ns := range c.config.Namespaces {
 		if ns.Path != "" {
 			nsMaps[ns.Type] = ns.Path
-			joinNamespaces = append(joinNamespaces, ns)
 			if ns.Type == configs.NEWPID {
 				doClone = true
 			}
@@ -198,13 +208,12 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
 	cmd.Env = append(cmd.Env, env...)
 
 	return &initProcess{
-		cmd:            cmd,
-		childPipe:      childPipe,
-		parentPipe:     parentPipe,
-		manager:        c.cgroupManager,
-		config:         c.newInitConfig(p),
-		joinNamespaces: joinNamespaces,
-		doClone:        doClone,
+		cmd:        cmd,
+		childPipe:  childPipe,
+		parentPipe: parentPipe,
+		manager:    c.cgroupManager,
+		config:     c.newInitConfig(p),
+		doClone:    doClone,
 	}, nil
 }
 
@@ -829,9 +838,8 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
 		configs.NEWPID,
 		configs.NEWNS,
 	}
-	// For now, only join user namespace if this is an exec in process and the
-	// container supports user namespace
-	if !doInit && c.config.Namespaces.Contains(configs.NEWUSER) {
+	// join userns if the init process explicitly requires NEWUSER
+	if c.config.Namespaces.Contains(configs.NEWUSER) {
 		nsTypes = append(nsTypes, configs.NEWUSER)
 	}
 	for _, nsType := range nsTypes {

diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go
@@ -916,3 +916,94 @@ func TestInitJoinPID(t *testing.T) {
 		t.Errorf("unexpected running process, output %q", out)
 	}
 }
+
+func TestInitJoinNetworkAndUser(t *testing.T) {
+	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+		t.Skip("userns is unsupported")
+	}
+	if testing.Short() {
+		return
+	}
+	rootfs, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs)
+
+	// Execute a long-running container
+	config1 := newTemplateConfig(rootfs)
+	config1.UidMappings = []configs.IDMap{{0, 0, 1000}}
+	config1.GidMappings = []configs.IDMap{{0, 0, 1000}}
+	config1.Namespaces = append(config1.Namespaces, configs.Namespace{Type: configs.NEWUSER})
+	container1, err := newContainer(config1)
+	ok(t, err)
+	defer container1.Destroy()
+
+	stdinR1, stdinW1, err := os.Pipe()
+	ok(t, err)
+	init1 := &libcontainer.Process{
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR1,
+	}
+	err = container1.Start(init1)
+	stdinR1.Close()
+	defer stdinW1.Close()
+	ok(t, err)
+
+	// get the state of the first container
+	state1, err := container1.State()
+	ok(t, err)
+	netns1 := state1.NamespacePaths[configs.NEWNET]
+	userns1 := state1.NamespacePaths[configs.NEWUSER]
+
+	// Start a container inside the existing netns and userns but with different cgroups
+	rootfs2, err := newRootfs()
+	ok(t, err)
+	defer remove(rootfs2)
+
+	config2 := newTemplateConfig(rootfs2)
+	config2.UidMappings = []configs.IDMap{{0, 0, 1000}}
+	config2.GidMappings = []configs.IDMap{{0, 0, 1000}}
+	config2.Namespaces.Add(configs.NEWNET, netns1)
+	config2.Namespaces.Add(configs.NEWUSER, userns1)
+	config2.Cgroups.Name = "test2"
+	container2, err := newContainerWithName("testCT2", config2)
+	ok(t, err)
+	defer container2.Destroy()
+
+	stdinR2, stdinW2, err := os.Pipe()
+	ok(t, err)
+	init2 := &libcontainer.Process{
+		Args:  []string{"cat"},
+		Env:   standardEnvironment,
+		Stdin: stdinR2,
+	}
+	err = container2.Start(init2)
+	stdinR2.Close()
+	defer stdinW2.Close()
+	ok(t, err)
+	// get the state of the second container
+	state2, err := container2.State()
+	ok(t, err)
+
+	for _, ns := range []string{"net", "user"} {
+		ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state1.InitProcessPid, ns))
+		ok(t, err)
+		ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state2.InitProcessPid, ns))
+		ok(t, err)
+		if ns1 != ns2 {
+			t.Errorf("%s(%s), wanted %s", ns, ns2, ns1)
+		}
+	}
+
+	// check that namespaces are not the same
+	if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) {
+		t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths,
+			state1.NamespacePaths)
+	}
+	// Stop init processes one by one. Stop the second container should
+	// not stop the first.
+	stdinW2.Close()
+	waitProcess(init2, t)
+	stdinW1.Close()
+	waitProcess(init1, t)
+}
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
@@ -51,14 +51,15 @@ int setns(int fd, int nstype)
 #endif
 #endif
 
-static int clone_parent(jmp_buf * env) __attribute__ ((noinline));
-static int clone_parent(jmp_buf * env)
+static int clone_parent(jmp_buf * env, int flags) __attribute__ ((noinline));
+static int clone_parent(jmp_buf * env, int flags)
 {
 	struct clone_arg ca;
 	int child;
 
 	ca.env = env;
-	child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
+	child = clone(child_func, ca.stack_ptr,
+		      CLONE_PARENT | SIGCHLD | flags, &ca);
 
 	return child;
 }
@@ -81,6 +82,7 @@ void nsexec()
 	jmp_buf env;
 	char buf[PATH_MAX], *val, *nspaths;
 	int nsLen, child, len, pipenum, consolefd = -1;
+	int cloneflags;
 	char *console;
 
 	// _LIBCONTAINER_NSPATH if exists is a comma-separated list of namespaces
@@ -89,6 +91,20 @@ void nsexec()
 	if (nspaths == NULL) {
 		return;
 	}
+	// _LIBCONTAINER_CLONEFLAGS is set when we want nsexec to setup namespaces
+	// after setns. Default to 0 which means namespaces will not be created
+	val = getenv("_LIBCONTAINER_CLONEFLAGS");
+	if (val == NULL) {
+		cloneflags = 0;
+	} else {
+		cloneflags = atoi(val);
+		snprintf(buf, sizeof(buf), "%d", cloneflags);
+		if (strcmp(val, buf)) {
+			pr_perror("Unable to parse _LIBCONTAINER_CLONEFLAGS");
+			exit(1);
+		}
+	}
+
 	// get the init pipe to communicate with parent
 	val = getenv("_LIBCONTAINER_INITPIPE");
 	if (val == NULL) {
@@ -178,7 +194,7 @@ void nsexec()
 	// We must fork to actually enter the PID namespace, use CLONE_PARENT
 	// so the child can have the right parent, and we don't need to forward
 	// the child's exit code or resend its death signal.
-	child = clone_parent(&env);
+	child = clone_parent(&env, cloneflags);
 	if (child < 0) {
 		pr_perror("Unable to fork");
 		exit(1);

diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
@@ -13,7 +13,6 @@ import (
 	"syscall"
 
 	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/system"
 )
 
@@ -167,10 +166,7 @@ type initProcess struct {
 	container  *linuxContainer
 	fds        []string
 
-	// joinNamespaces are additional namespaces that the init process will join
-	// instead of creating new ones
-	joinNamespaces configs.Namespaces
-	doClone        bool
+	doClone bool
 }
 
 func (p *initProcess) pid() int {
@@ -217,7 +213,7 @@ func (p *initProcess) start() error {
 		return newSystemError(err)
 	}
 	// if we need to clone a new child process
-	if len(p.joinNamespaces) > 0 && p.doClone {
+	if p.doClone {
 		if err := p.execSetns(); err != nil {
 			return newSystemError(err)
 		}