From 8ec02ea1b14edac85a6e9a2b622ae44e15e11ccd Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 16 Mar 2023 09:52:26 -0700 Subject: [PATCH] nsexec: retry unshare on EINVAL Older kernels may return EINVAL on unshare when a process is reading runc's /proc/$PID/status or /proc/$PID/maps. This was fixed by kernel commit 12c641ab8270f ("unshare: Unsharing a thread does not require unsharing a vm") in Linux v4.3. For CentOS 7, the fix was backported to CentOS 7.7 (kernel 3.10.0-1062). To work around this kernel bug, let's retry on EINVAL a few times. Reported-by: zzyyzte Signed-off-by: Kir Kolyshkin (cherry picked from commit cecb039d24d9cef17cc37df9d7cd338b83436138) Signed-off-by: Kir Kolyshkin --- libcontainer/nsenter/nsexec.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index b12f2355c7a..2d224babea9 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -832,6 +832,25 @@ void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mount bail("failed to close container mount namespace fd %d", container_mntns_fd); } +void try_unshare(int flags, const char *msg) +{ + write_log(DEBUG, "unshare %s", msg); + /* + * Kernels prior to v4.3 may return EINVAL on unshare when another process + * reads runc's /proc/$PID/status or /proc/$PID/maps. To work around this, + * retry on EINVAL a few times. + */ + int retries = 5; + for (; retries > 0; retries--) { + if (unshare(flags) == 0) { + return; + } + if (errno != EINVAL) + break; + } + bail("failed to unshare %s", msg); +} + void nsexec(void) { int pipenum; @@ -1170,9 +1189,7 @@ void nsexec(void) * problem. */ if (config.cloneflags & CLONE_NEWUSER) { - write_log(DEBUG, "unshare user namespace"); - if (unshare(CLONE_NEWUSER) < 0) - bail("failed to unshare user namespace"); + try_unshare(CLONE_NEWUSER, "user namespace"); config.cloneflags &= ~CLONE_NEWUSER; /* @@ -1224,9 +1241,7 @@ void nsexec(void) * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * was broken, so we'll just do it the long way anyway. */ - write_log(DEBUG, "unshare remaining namespace (except cgroupns)"); - if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) - bail("failed to unshare remaining namespaces (except cgroupns)"); + try_unshare(config.cloneflags & ~CLONE_NEWCGROUP, "remaining namespaces (except cgroupns)"); /* Ask our parent to send the mount sources fds. */ if (config.mountsources) { @@ -1344,8 +1359,7 @@ void nsexec(void) } if (config.cloneflags & CLONE_NEWCGROUP) { - if (unshare(CLONE_NEWCGROUP) < 0) - bail("failed to unshare cgroup namespace"); + try_unshare(CLONE_NEWCGROUP, "cgroup namespace"); } write_log(DEBUG, "signal completion to stage-0");