diff --git a/Gopkg.lock b/Gopkg.lock index 663907c013..b16f86df47 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -1,14 +1,6 @@ # This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. -[[projects]] - digest = "1:b54ea408c3a0a82d6d644fbb4468d00b50b1368482abc337aec72b60a3ea7c12" - name = "github.com/checkpoint-restore/go-criu" - packages = ["rpc"] - pruneopts = "NUT" - revision = "17b0214f6c48980c45dc47ecb0cfd6d9e02df723" - version = "v3.11" - [[projects]] branch = "master" digest = "1:8ecb89af7dfe3ac401bdb0c9390b134ef96a97e85f732d2b0604fb7b3977839f" @@ -141,7 +133,7 @@ revision = "7d4729fb36185a7c1719923406c9d40e54fb93c7" [[projects]] - digest = "1:b2eb2cc22a636b636ed6f57847c37ca065bddbe456303930193083edd36f1203" + digest = "1:e9efda6418044c653a9d6051719196214dbdbe4fa50ebcc7975dde8932b6d2f5" name = "github.com/opencontainers/runc" packages = [ "libcontainer", @@ -151,6 +143,7 @@ "libcontainer/cgroups/systemd", "libcontainer/configs", "libcontainer/configs/validate", + "libcontainer/criurpc", "libcontainer/intelrdt", "libcontainer/keys", "libcontainer/mount", @@ -163,7 +156,7 @@ "libcontainer/utils", ] pruneopts = "NUT" - revision = "f56b4cbeadc407e715d9b2ba49e62185bd81cef4" + revision = "cc4307ab6643668ce5abc6b524e1764a54c32550" [[projects]] digest = "1:0d447d4961f4f9270457fbc20d0261bba8d3056f395efd2e2480e2dfa4487a60" diff --git a/Gopkg.toml b/Gopkg.toml index 84167b528c..cf910debab 100644 --- a/Gopkg.toml +++ b/Gopkg.toml @@ -12,7 +12,7 @@ [[constraint]] name = "github.com/opencontainers/runc" - revision = "f56b4cbeadc407e715d9b2ba49e62185bd81cef4" + revision = "cc4307ab6643668ce5abc6b524e1764a54c32550" [[constraint]] name = "github.com/opencontainers/runtime-spec" diff --git a/vendor/github.com/checkpoint-restore/go-criu/LICENSE b/vendor/github.com/checkpoint-restore/go-criu/LICENSE deleted file mode 100644 index 8dada3edaf..0000000000 --- a/vendor/github.com/checkpoint-restore/go-criu/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go index f672ba2737..6d9123dc26 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -3,6 +3,7 @@ package fs import ( + "errors" "fmt" "io" "io/ioutil" @@ -13,8 +14,6 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" - "github.com/pkg/errors" - "golang.org/x/sys/unix" ) var ( @@ -36,7 +35,7 @@ var ( HugePageSizes, _ = cgroups.GetHugePageSize() ) -var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist") +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") type subsystemSet []subsystem @@ -63,10 +62,9 @@ type subsystem interface { } type Manager struct { - mu sync.Mutex - Cgroups *configs.Cgroup - Rootless bool // ignore permission-related errors - Paths map[string]string + mu sync.Mutex + Cgroups *configs.Cgroup + Paths map[string]string } // The absolute path to the root of the cgroup hierarchies. @@ -102,33 +100,6 @@ type cgroupData struct { pid int } -// isIgnorableError returns whether err is a permission error (in the loose -// sense of the word). This includes EROFS (which for an unprivileged user is -// basically a permission error) and EACCES (for similar reasons) as well as -// the normal EPERM. -func isIgnorableError(rootless bool, err error) bool { - // We do not ignore errors if we are root. - if !rootless { - return false - } - // Is it an ordinary EPERM? - if os.IsPermission(errors.Cause(err)) { - return true - } - - // Try to handle other errnos. - var errno error - switch err := errors.Cause(err).(type) { - case *os.PathError: - errno = err.Err - case *os.LinkError: - errno = err.Err - case *os.SyscallError: - errno = err.Err - } - return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES -} - func (m *Manager) Apply(pid int) (err error) { if m.Cgroups == nil { return nil @@ -174,11 +145,11 @@ func (m *Manager) Apply(pid int) (err error) { m.Paths[sys.Name()] = p if err := sys.Apply(d); err != nil { - // In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't - // been set, we don't bail on error in case of permission problems. - // Cases where limits have been set (and we couldn't create our own - // cgroup) are handled by Set. - if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" { + if os.IsPermission(err) && m.Cgroups.Path == "" { + // If we didn't set a cgroup path, then let's defer the error here + // until we know whether we have set limits or not. + // If we hadn't set limits, then it's ok that we couldn't join this cgroup, because + // it will have the same limits as its parent. delete(m.Paths, sys.Name()) continue } @@ -236,16 +207,9 @@ func (m *Manager) Set(container *configs.Config) error { for _, sys := range subsystems { path := paths[sys.Name()] if err := sys.Set(path, container.Cgroups); err != nil { - if m.Rootless && sys.Name() == "devices" { - continue - } - // When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work. - // However, errors from other subsystems are not ignored. - // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" if path == "" { - // We never created a path for this cgroup, so we cannot set - // limits for it (though we have already tried at this point). - return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) + // cgroup never applied + return fmt.Errorf("cannot set limits on the %s cgroup, as the container has not joined it", sys.Name()) } return err } @@ -317,7 +281,7 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { } func (raw *cgroupData) path(subsystem string) (string, error) { - mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem) + mnt, err := cgroups.FindCgroupMountpoint(subsystem) // If we didn't mount the subsystem, there is no point we make the path. if err != nil { return "", err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go index e240a8313a..b712bd0b1e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -46,7 +46,11 @@ func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems - return cgroups.WriteCgroupProc(path, pid) + if err := cgroups.WriteCgroupProc(path, pid); err != nil { + return err + } + + return nil } func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error { @@ -79,7 +83,11 @@ func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { return err } } - return s.SetRtSched(path, cgroup) + if err := s.SetRtSched(path, cgroup); err != nil { + return err + } + + return nil } func (s *CpuGroup) Remove(d *cgroupData) error { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go index 5a1d152ea1..20c9eafac2 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -77,14 +77,18 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro // The logic is, if user specified cpuset configs, use these // specified configs, otherwise, inherit from parent. This makes // cpuset configs work correctly with 'cpuset.cpu_exclusive', and - // keep backward compatibility. + // keep backward compatbility. if err := s.ensureCpusAndMems(dir, cgroup); err != nil { return err } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems - return cgroups.WriteCgroupProc(dir, pid) + if err := cgroups.WriteCgroupProc(dir, pid); err != nil { + return err + } + + return nil } func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go deleted file mode 100644 index 69b5a1946c..0000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go +++ /dev/null @@ -1,62 +0,0 @@ -// +build linux,!nokmem - -package fs - -import ( - "errors" - "fmt" - "io/ioutil" - "os" - "path/filepath" - "strconv" - "syscall" // for Errno type only - - "github.com/opencontainers/runc/libcontainer/cgroups" - "golang.org/x/sys/unix" -) - -const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" - -func EnableKernelMemoryAccounting(path string) error { - // Ensure that kernel memory is available in this kernel build. If it - // isn't, we just ignore it because EnableKernelMemoryAccounting is - // automatically called for all memory limits. - if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { - return nil - } - // We have to limit the kernel memory here as it won't be accounted at all - // until a limit is set on the cgroup and limit cannot be set once the - // cgroup has children, or if there are already tasks in the cgroup. - for _, i := range []int64{1, -1} { - if err := setKernelMemory(path, i); err != nil { - return err - } - } - return nil -} - -func setKernelMemory(path string, kernelMemoryLimit int64) error { - if path == "" { - return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) - } - if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { - // We have specifically been asked to set a kmem limit. If the kernel - // doesn't support it we *must* error out. - return errors.New("kernel memory accounting not supported by this kernel") - } - if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { - // Check if the error number returned by the syscall is "EBUSY" - // The EBUSY signal is returned on attempts to write to the - // memory.kmem.limit_in_bytes file if the cgroup has children or - // once tasks have been attached to the cgroup - if pathErr, ok := err.(*os.PathError); ok { - if errNo, ok := pathErr.Err.(syscall.Errno); ok { - if errNo == unix.EBUSY { - return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) - } - } - } - return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go deleted file mode 100644 index ac290fd7a0..0000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem_disabled.go +++ /dev/null @@ -1,15 +0,0 @@ -// +build linux,nokmem - -package fs - -import ( - "errors" -) - -func EnableKernelMemoryAccounting(path string) error { - return nil -} - -func setKernelMemory(path string, kernelMemoryLimit int64) error { - return errors.New("kernel memory accounting disabled in this runc build") -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go index d5310d569f..ad395a5d62 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -5,18 +5,23 @@ package fs import ( "bufio" "fmt" + "io/ioutil" "os" "path/filepath" "strconv" "strings" + "syscall" // only for Errno "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + + "golang.org/x/sys/unix" ) const ( - cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" - cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" ) type MemoryGroup struct { @@ -62,6 +67,44 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { return nil } +func EnableKernelMemoryAccounting(path string) error { + // Check if kernel memory is enabled + // We have to limit the kernel memory here as it won't be accounted at all + // until a limit is set on the cgroup and limit cannot be set once the + // cgroup has children, or if there are already tasks in the cgroup. + for _, i := range []int64{1, -1} { + if err := setKernelMemory(path, i); err != nil { + return err + } + } + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + if path == "" { + return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) + } + if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { + // kernel memory is not enabled on the system so we should do nothing + return nil + } + if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { + // Check if the error number returned by the syscall is "EBUSY" + // The EBUSY signal is returned on attempts to write to the + // memory.kmem.limit_in_bytes file if the cgroup has children or + // once tasks have been attached to the cgroup + if pathErr, ok := err.(*os.PathError); ok { + if errNo, ok := pathErr.Err.(syscall.Errno); ok { + if errNo == unix.EBUSY { + return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) + } + } + } + return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) + } + return nil +} + func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { // If the memory update is set to -1 we should also // set swap to -1, it means unlimited memory. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go index a10e3f6a89..efadc2a3ca 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -5,8 +5,6 @@ package systemd import ( "errors" "fmt" - "io/ioutil" - "math" "os" "path/filepath" "strings" @@ -72,11 +70,12 @@ const ( ) var ( - connLock sync.Mutex - theConn *systemdDbus.Conn - hasStartTransientUnit bool - hasStartTransientSliceUnit bool - hasDelegateSlice bool + connLock sync.Mutex + theConn *systemdDbus.Conn + hasStartTransientUnit bool + hasStartTransientSliceUnit bool + hasTransientDefaultDependencies bool + hasDelegate bool ) func newProp(name string, units interface{}) systemdDbus.Property { @@ -114,6 +113,53 @@ func UseSystemd() bool { } } + // Ensure the scope name we use doesn't exist. Use the Pid to + // avoid collisions between multiple libcontainer users on a + // single host. + scope := fmt.Sprintf("libcontainer-%d-systemd-test-default-dependencies.scope", os.Getpid()) + testScopeExists := true + for i := 0; i <= testScopeWait; i++ { + if _, err := theConn.StopUnit(scope, "replace", nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") { + testScopeExists = false + break + } + } + } + time.Sleep(time.Millisecond) + } + + // Bail out if we can't kill this scope without testing for DefaultDependencies + if testScopeExists { + return hasStartTransientUnit + } + + // Assume StartTransientUnit on a scope allows DefaultDependencies + hasTransientDefaultDependencies = true + ddf := newProp("DefaultDependencies", false) + if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{ddf}, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { + hasTransientDefaultDependencies = false + } + } + } + + // Not critical because of the stop unit logic above. + theConn.StopUnit(scope, "replace", nil) + + // Assume StartTransientUnit on a scope allows Delegate + hasDelegate = true + dl := newProp("Delegate", true) + if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") { + hasDelegate = false + } + } + } + // Assume we have the ability to start a transient unit as a slice // This was broken until systemd v229, but has been back-ported on RHEL environments >= 219 // For details, see: https://bugzilla.redhat.com/show_bug.cgi?id=1370299 @@ -142,22 +188,7 @@ func UseSystemd() bool { } // Not critical because of the stop unit logic above. - theConn.StopUnit(slice, "replace", nil) - - // Assume StartTransientUnit on a slice allows Delegate - hasDelegateSlice = true - dlSlice := newProp("Delegate", true) - if _, err := theConn.StartTransientUnit(slice, "replace", []systemdDbus.Property{dlSlice}, nil); err != nil { - if dbusError, ok := err.(dbus.Error); ok { - // Starting with systemd v237, Delegate is not even a property of slices anymore, - // so the D-Bus call fails with "InvalidArgs" error. - if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") || strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.InvalidArgs") { - hasDelegateSlice = false - } - } - } - - // Not critical because of the stop unit logic above. + theConn.StopUnit(scope, "replace", nil) theConn.StopUnit(slice, "replace", nil) } return hasStartTransientUnit @@ -211,14 +242,8 @@ func (m *Manager) Apply(pid int) error { properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) } - // Check if we can delegate. This is only supported on systemd versions 218 and above. - if strings.HasSuffix(unitName, ".slice") { - if hasDelegateSlice { - // systemd 237 and above no longer allows delegation on a slice - properties = append(properties, newProp("Delegate", true)) - } - } else { - // Assume scopes always support delegation. + if hasDelegate { + // This is only supported on systemd versions 218 and above. properties = append(properties, newProp("Delegate", true)) } @@ -229,9 +254,10 @@ func (m *Manager) Apply(pid int) error { newProp("CPUAccounting", true), newProp("BlockIOAccounting", true)) - // Assume DefaultDependencies= will always work (the check for it was previously broken.) - properties = append(properties, - newProp("DefaultDependencies", false)) + if hasTransientDefaultDependencies { + properties = append(properties, + newProp("DefaultDependencies", false)) + } if c.Resources.Memory != 0 { properties = append(properties, @@ -245,19 +271,13 @@ func (m *Manager) Apply(pid int) error { // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { - // corresponds to USEC_INFINITY in systemd - // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd - // always setting a property value ensures we can apply a quota and remove it later - cpuQuotaPerSecUSec := uint64(math.MaxUint64) - if c.Resources.CpuQuota > 0 { - // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota - // (integer percentage of CPU) internally. This means that if a fractional percent of - // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest - // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. - cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod - if cpuQuotaPerSecUSec%10000 != 0 { - cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 - } + cpuQuotaPerSecUSec := uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 } properties = append(properties, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) @@ -268,12 +288,6 @@ func (m *Manager) Apply(pid int) error { newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) } - if c.Resources.PidsLimit > 0 { - properties = append(properties, - newProp("TasksAccounting", true), - newProp("TasksMax", uint64(c.Resources.PidsLimit))) - } - // We have to set kernel memory here, as we can't change it once // processes have been attached to the cgroup. if c.Resources.KernelMemory != 0 { @@ -282,17 +296,17 @@ func (m *Manager) Apply(pid int) error { } } - statusChan := make(chan string, 1) - if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { - select { - case <-statusChan: - case <-time.After(time.Second): - logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName) - } - } else if !isUnitExists(err) { + statusChan := make(chan string) + if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err != nil && !isUnitExists(err) { return err } + select { + case <-statusChan: + case <-time.After(time.Second): + logrus.Warnf("Timed out while waiting for StartTransientUnit completion signal from dbus. Continuing...") + } + if err := joinCgroups(c, pid); err != nil { return err } @@ -419,7 +433,7 @@ func ExpandSlice(slice string) (string, error) { } func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { - mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem) + mountpoint, err := cgroups.FindCgroupMountpoint(subsystem) if err != nil { return "", err } @@ -539,15 +553,6 @@ func setKernelMemory(c *configs.Cgroup) error { if err := os.MkdirAll(path, 0755); err != nil { return err } - // do not try to enable the kernel memory if we already have - // tasks in the cgroup. - content, err := ioutil.ReadFile(filepath.Join(path, "tasks")) - if err != nil { - return err - } - if len(content) > 0 { - return nil - } return fs.EnableKernelMemoryAccounting(path) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index 9717acc729..7c995efee5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -13,51 +13,40 @@ import ( "strings" "time" - units "github.com/docker/go-units" - "golang.org/x/sys/unix" + "github.com/docker/go-units" ) const ( - CgroupNamePrefix = "name=" + cgroupNamePrefix = "name=" CgroupProcesses = "cgroup.procs" ) // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt -func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { - mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) +func FindCgroupMountpoint(subsystem string) (string, error) { + mnt, _, err := FindCgroupMountpointAndRoot(subsystem) return mnt, err } -func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { +func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) { // We are not using mount.GetMounts() because it's super-inefficient, // parsing it directly sped up x10 times because of not using Sscanf. // It was one of two major performance drawbacks in container start. if !isSubsystemAvailable(subsystem) { return "", "", NewNotFoundError(subsystem) } - f, err := os.Open("/proc/self/mountinfo") if err != nil { return "", "", err } defer f.Close() - return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem) -} - -func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) { - scanner := bufio.NewScanner(reader) + scanner := bufio.NewScanner(f) for scanner.Scan() { txt := scanner.Text() - fields := strings.Fields(txt) - if len(fields) < 5 { - continue - } - if strings.HasPrefix(fields[4], cgroupPath) { - for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if opt == subsystem { - return fields[4], fields[3], nil - } + fields := strings.Split(txt, " ") + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if opt == subsystem { + return fields[4], fields[3], nil } } } @@ -114,7 +103,7 @@ func FindCgroupMountpointDir() (string, error) { } if postSeparatorFields[0] == "cgroup" { - // Check that the mount is properly formatted. + // Check that the mount is properly formated. if numPostFields < 3 { return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) } @@ -162,20 +151,19 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, Root: fields[3], } for _, opt := range strings.Split(fields[len(fields)-1], ",") { - seen, known := ss[opt] - if !known || (!all && seen) { + if !ss[opt] { continue } - ss[opt] = true - if strings.HasPrefix(opt, CgroupNamePrefix) { - opt = opt[len(CgroupNamePrefix):] + if strings.HasPrefix(opt, cgroupNamePrefix) { + m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):]) + } else { + m.Subsystems = append(m.Subsystems, opt) + } + if !all { + numFound++ } - m.Subsystems = append(m.Subsystems, opt) - numFound++ - } - if len(m.Subsystems) > 0 || all { - res = append(res, m) } + res = append(res, m) } if err := scanner.Err(); err != nil { return nil, err @@ -199,7 +187,7 @@ func GetCgroupMounts(all bool) ([]Mount, error) { allMap := make(map[string]bool) for s := range allSubsystems { - allMap[s] = false + allMap[s] = true } return getCgroupMountsHelper(allMap, f, all) } @@ -268,13 +256,13 @@ func GetInitCgroupPath(subsystem string) (string, error) { } func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) + mnt, root, err := FindCgroupMountpointAndRoot(subsystem) if err != nil { return "", err } // This is needed for nested containers, because in /proc/self/cgroup we - // see paths from host, which don't exist in container. + // see pathes from host, which don't exist in container. relCgroup, err := filepath.Rel(root, cgroup) if err != nil { return "", err @@ -354,7 +342,7 @@ func getControllerPath(subsystem string, cgroups map[string]string) (string, err return p, nil } - if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { + if p, ok := cgroups[cgroupNamePrefix+subsystem]; ok { return p, nil } @@ -465,39 +453,10 @@ func WriteCgroupProc(dir string, pid int) error { } // Dont attach any pid to the cgroup if -1 is specified as a pid - if pid == -1 { - return nil - } - - cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700) - if err != nil { - return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) - } - defer cgroupProcessesFile.Close() - - for i := 0; i < 5; i++ { - _, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid)) - if err == nil { - return nil - } - - // EINVAL might mean that the task being added to cgroup.procs is in state - // TASK_NEW. We should attempt to do so again. - if isEINVAL(err) { - time.Sleep(30 * time.Millisecond) - continue + if pid != -1 { + if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) } - - return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) - } - return err -} - -func isEINVAL(err error) bool { - switch err := err.(type) { - case *os.PathError: - return err.Err == unix.EINVAL - default: - return false } + return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 7728522fef..3cae4fd8d9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -141,10 +141,9 @@ type Config struct { // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores // for a process. Valid values are between the range [-1000, '1000'], where processes with - // higher scores are preferred for being killed. If it is unset then we don't touch the current - // value. + // higher scores are preferred for being killed. // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ - OomScoreAdj *int `json:"oom_score_adj,omitempty"` + OomScoreAdj int `json:"oom_score_adj"` // UidMappings is an array of User ID mappings for User Namespaces UidMappings []IDMap `json:"uid_mappings"` @@ -186,19 +185,12 @@ type Config struct { // callers keyring in this case. NoNewKeyring bool `json:"no_new_keyring"` - // IntelRdt specifies settings for Intel RDT group that the container is placed into - // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available - IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` - - // RootlessEUID is set when the runc was launched with non-zero EUID. - // Note that RootlessEUID is set to false when launched with EUID=0 in userns. - // When RootlessEUID is set, runc creates a new userns for the container. - // (config.json needs to contain userns settings) - RootlessEUID bool `json:"rootless_euid,omitempty"` + // Rootless specifies whether the container is a rootless container. + Rootless bool `json:"rootless"` - // RootlessCgroups is set when unlikely to have the full access to cgroups. - // When RootlessCgroups is set, cgroups errors are ignored. - RootlessCgroups bool `json:"rootless_cgroups,omitempty"` + // IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into + // to limit the resources (e.g., L3 cache) the container has available + IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` } type Hooks struct { @@ -272,23 +264,26 @@ func (hooks Hooks) MarshalJSON() ([]byte, error) { }) } +// HookState is the payload provided to a hook on execution. +type HookState specs.State + type Hook interface { // Run executes the hook with the provided state. - Run(*specs.State) error + Run(HookState) error } // NewFunctionHook will call the provided function when the hook is run. -func NewFunctionHook(f func(*specs.State) error) FuncHook { +func NewFunctionHook(f func(HookState) error) FuncHook { return FuncHook{ run: f, } } type FuncHook struct { - run func(*specs.State) error + run func(HookState) error } -func (f FuncHook) Run(s *specs.State) error { +func (f FuncHook) Run(s HookState) error { return f.run(s) } @@ -311,7 +306,7 @@ type CommandHook struct { Command } -func (c Command) Run(s *specs.State) error { +func (c Command) Run(s HookState) error { b, err := json.Marshal(s) if err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go index 57e9f037d9..36bd5f96a1 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go @@ -4,10 +4,4 @@ type IntelRdt struct { // The schema for L3 cache id and capacity bitmask (CBM) // Format: "L3:=;=;..." L3CacheSchema string `json:"l3_cache_schema,omitempty"` - - // The schema of memory bandwidth per L3 cache id - // Format: "MB:=bandwidth0;=bandwidth1;..." - // The unit of memory bandwidth is specified in "percentages" by - // default, and in "MBps" if MBA Software Controller is enabled. - MemBwSchema string `json:"memBwSchema,omitempty"` } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go index 1bbaef9bd9..5fc171a57b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go @@ -7,13 +7,12 @@ import ( ) const ( - NEWNET NamespaceType = "NEWNET" - NEWPID NamespaceType = "NEWPID" - NEWNS NamespaceType = "NEWNS" - NEWUTS NamespaceType = "NEWUTS" - NEWIPC NamespaceType = "NEWIPC" - NEWUSER NamespaceType = "NEWUSER" - NEWCGROUP NamespaceType = "NEWCGROUP" + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" ) var ( @@ -36,8 +35,6 @@ func NsName(ns NamespaceType) string { return "user" case NEWUTS: return "uts" - case NEWCGROUP: - return "cgroup" } return "" } @@ -71,7 +68,6 @@ func NamespaceTypes() []NamespaceType { NEWNET, NEWPID, NEWNS, - NEWCGROUP, } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go index 2dc7adfc96..4ce6813d23 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go @@ -9,13 +9,12 @@ func (n *Namespace) Syscall() int { } var namespaceInfo = map[NamespaceType]int{ - NEWNET: unix.CLONE_NEWNET, - NEWNS: unix.CLONE_NEWNS, - NEWUSER: unix.CLONE_NEWUSER, - NEWIPC: unix.CLONE_NEWIPC, - NEWUTS: unix.CLONE_NEWUTS, - NEWPID: unix.CLONE_NEWPID, - NEWCGROUP: unix.CLONE_NEWCGROUP, + NEWNET: unix.CLONE_NEWNET, + NEWNS: unix.CLONE_NEWNS, + NEWUSER: unix.CLONE_NEWUSER, + NEWIPC: unix.CLONE_NEWIPC, + NEWUTS: unix.CLONE_NEWUTS, + NEWPID: unix.CLONE_NEWPID, } // CloneFlags parses the container's Namespaces options to set the correct diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go index 393d9e81ee..e532ac8fe2 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go @@ -2,18 +2,23 @@ package validate import ( "fmt" + "os" + "reflect" "strings" "github.com/opencontainers/runc/libcontainer/configs" ) -// rootlessEUID makes sure that the config can be applied when runc -// is being executed as a non-root user (euid != 0) in the current user namespace. -func (v *ConfigValidator) rootlessEUID(config *configs.Config) error { - if err := rootlessEUIDMappings(config); err != nil { +var ( + geteuid = os.Geteuid + getegid = os.Getegid +) + +func (v *ConfigValidator) rootless(config *configs.Config) error { + if err := rootlessMappings(config); err != nil { return err } - if err := rootlessEUIDMount(config); err != nil { + if err := rootlessMount(config); err != nil { return err } @@ -33,9 +38,11 @@ func hasIDMapping(id int, mappings []configs.IDMap) bool { return false } -func rootlessEUIDMappings(config *configs.Config) error { - if !config.Namespaces.Contains(configs.NEWUSER) { - return fmt.Errorf("rootless container requires user namespaces") +func rootlessMappings(config *configs.Config) error { + if euid := geteuid(); euid != 0 { + if !config.Namespaces.Contains(configs.NEWUSER) { + return fmt.Errorf("rootless containers require user namespaces") + } } if len(config.UidMappings) == 0 { @@ -44,13 +51,34 @@ func rootlessEUIDMappings(config *configs.Config) error { if len(config.GidMappings) == 0 { return fmt.Errorf("rootless containers requires at least one GID mapping") } + + return nil +} + +// cgroup verifies that the user isn't trying to set any cgroup limits or paths. +func rootlessCgroup(config *configs.Config) error { + // Nothing set at all. + if config.Cgroups == nil || config.Cgroups.Resources == nil { + return nil + } + + // Used for comparing to the zero value. + left := reflect.ValueOf(*config.Cgroups.Resources) + right := reflect.Zero(left.Type()) + + // This is all we need to do, since specconv won't add cgroup options in + // rootless mode. + if !reflect.DeepEqual(left.Interface(), right.Interface()) { + return fmt.Errorf("cannot specify resource limits in rootless container") + } + return nil } // mount verifies that the user isn't trying to set up any mounts they don't have // the rights to do. In addition, it makes sure that no mount has a `uid=` or // `gid=` option that doesn't resolve to root. -func rootlessEUIDMount(config *configs.Config) error { +func rootlessMount(config *configs.Config) error { // XXX: We could whitelist allowed devices at this point, but I'm not // convinced that's a good idea. The kernel is the best arbiter of // access control. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go index 3b42f30107..cbbba9a03a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -38,17 +38,14 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { if err := v.usernamespace(config); err != nil { return err } - if err := v.cgroupnamespace(config); err != nil { - return err - } if err := v.sysctl(config); err != nil { return err } if err := v.intelrdt(config); err != nil { return err } - if config.RootlessEUID { - if err := v.rootlessEUID(config); err != nil { + if config.Rootless { + if err := v.rootless(config); err != nil { return err } } @@ -119,15 +116,6 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error { return nil } -func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error { - if config.Namespaces.Contains(configs.NEWCGROUP) { - if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { - return fmt.Errorf("cgroup namespaces aren't enabled in the kernel") - } - } - return nil -} - // sysctl validates that the specified sysctl keys are valid or not. // /proc/sys isn't completely namespaced and depending on which namespaces // are specified, a subset of sysctls are permitted. @@ -163,16 +151,6 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) } } - if config.Namespaces.Contains(configs.NEWUTS) { - switch s { - case "kernel.domainname": - // This is namespaced and there's no explicit OCI field for it. - continue - case "kernel.hostname": - // This is namespaced but there's a conflicting (dedicated) OCI field for it. - return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname") - } - } return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) } @@ -181,22 +159,11 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { func (v *ConfigValidator) intelrdt(config *configs.Config) error { if config.IntelRdt != nil { - if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() { - return fmt.Errorf("intelRdt is specified in config, but Intel RDT is not supported or enabled") - } - - if !intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema != "" { - return fmt.Errorf("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled") - } - if !intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema != "" { - return fmt.Errorf("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled") - } - - if intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema == "" { - return fmt.Errorf("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + if !intelrdt.IsEnabled() { + return fmt.Errorf("intelRdt is specified in config, but Intel RDT feature is not supported or enabled") } - if intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema == "" { - return fmt.Errorf("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty") + if config.IntelRdt.L3CacheSchema == "" { + return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container.go b/vendor/github.com/opencontainers/runc/libcontainer/container.go index ba7541c5fd..2e31b4d4fc 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container.go @@ -9,7 +9,6 @@ import ( "time" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runtime-spec/specs-go" ) // Status is the status of a container. @@ -86,12 +85,6 @@ type BaseContainer interface { // SystemError - System error. State() (*State, error) - // OCIState returns the current container's state information. - // - // errors: - // SystemError - System error. - OCIState() (*specs.State, error) - // Returns the current config of the container. Config() configs.Config diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go index 7e58e5e008..b3e157bdf8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -19,17 +19,16 @@ import ( "syscall" // only for SysProcAttr and Signal "time" - "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/criurpc" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/opencontainers/runtime-spec/specs-go" - criurpc "github.com/checkpoint-restore/go-criu/rpc" "github.com/golang/protobuf/proto" "github.com/sirupsen/logrus" + "github.com/syndtr/gocapability/capability" "github.com/vishvananda/netlink/nl" "golang.org/x/sys/unix" ) @@ -61,8 +60,7 @@ type State struct { // Platform specific fields below here - // Specified if the container was started under the rootless mode. - // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups + // Specifies if the container was started under the rootless mode. Rootless bool `json:"rootless"` // Path to all the cgroups setup for a container. Key is cgroup subsystem name @@ -158,12 +156,6 @@ func (c *linuxContainer) State() (*State, error) { return c.currentState() } -func (c *linuxContainer) OCIState() (*specs.State, error) { - c.m.Lock() - defer c.m.Unlock() - return c.currentOCIState() -} - func (c *linuxContainer) Processes() ([]int, error) { pids, err := c.cgroupManager.GetAllPids() if err != nil { @@ -233,13 +225,17 @@ func (c *linuxContainer) Set(config configs.Config) error { func (c *linuxContainer) Start(process *Process) error { c.m.Lock() defer c.m.Unlock() - if process.Init { + status, err := c.currentStatus() + if err != nil { + return err + } + if status == Stopped { if err := c.createExecFifo(); err != nil { return err } } - if err := c.start(process); err != nil { - if process.Init { + if err := c.start(process, status == Stopped); err != nil { + if status == Stopped { c.deleteExecFifo() } return err @@ -248,10 +244,17 @@ func (c *linuxContainer) Start(process *Process) error { } func (c *linuxContainer) Run(process *Process) error { + c.m.Lock() + status, err := c.currentStatus() + if err != nil { + c.m.Unlock() + return err + } + c.m.Unlock() if err := c.Start(process); err != nil { return err } - if process.Init { + if status == Stopped { return c.exec() } return nil @@ -332,8 +335,8 @@ type openResult struct { err error } -func (c *linuxContainer) start(process *Process) error { - parent, err := c.newParentProcess(process) +func (c *linuxContainer) start(process *Process, isInit bool) error { + parent, err := c.newParentProcess(process, isInit) if err != nil { return newSystemErrorWithCause(err, "creating new parent process") } @@ -346,7 +349,7 @@ func (c *linuxContainer) start(process *Process) error { } // generate a timestamp indicating when the container was started c.created = time.Now().UTC() - if process.Init { + if isInit { c.state = &createdState{ c: c, } @@ -357,9 +360,13 @@ func (c *linuxContainer) start(process *Process) error { c.initProcessStartTime = state.InitProcessStartTime if c.config.Hooks != nil { - s, err := c.currentOCIState() - if err != nil { - return err + bundle, annotations := utils.Annotations(c.config.Labels) + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Pid: parent.pid(), + Bundle: bundle, + Annotations: annotations, } for i, hook := range c.config.Hooks.Poststart { if err := hook.Run(s); err != nil { @@ -378,18 +385,10 @@ func (c *linuxContainer) Signal(s os.Signal, all bool) error { if all { return signalAllProcesses(c.cgroupManager, s) } - status, err := c.currentStatus() - if err != nil { - return err - } - // to avoid a PID reuse attack - if status == Running || status == Created || status == Paused { - if err := c.initProcess.signal(s); err != nil { - return newSystemErrorWithCause(err, "signaling init process") - } - return nil + if err := c.initProcess.signal(s); err != nil { + return newSystemErrorWithCause(err, "signaling init process") } - return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + return nil } func (c *linuxContainer) createExecFifo() error { @@ -412,7 +411,10 @@ func (c *linuxContainer) createExecFifo() error { return err } unix.Umask(oldMask) - return os.Chown(fifoName, rootuid, rootgid) + if err := os.Chown(fifoName, rootuid, rootgid); err != nil { + return err + } + return nil } func (c *linuxContainer) deleteExecFifo() { @@ -437,7 +439,7 @@ func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { return nil } -func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { +func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) { parentPipe, childPipe, err := utils.NewSockPair("init") if err != nil { return nil, newSystemErrorWithCause(err, "creating new init pipe") @@ -446,7 +448,7 @@ func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { if err != nil { return nil, newSystemErrorWithCause(err, "creating new command template") } - if !p.Init { + if !doInit { return c.newSetnsProcess(p, cmd, parentPipe, childPipe) } @@ -471,7 +473,6 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} } - cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS"))) cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) if p.ConsoleSocket != nil { cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) @@ -482,7 +483,6 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. cmd.ExtraFiles = append(cmd.ExtraFiles, childPipe) cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), - fmt.Sprintf("_LIBCONTAINER_STATEDIR=%s", c.root), ) // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason @@ -506,7 +506,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c if err != nil { return nil, err } - init := &initProcess{ + return &initProcess{ cmd: cmd, childPipe: childPipe, parentPipe: parentPipe, @@ -517,9 +517,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c process: p, bootstrapData: data, sharePidns: sharePidns, - } - c.initProcess = init - return init, nil + }, nil } func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { @@ -535,15 +533,14 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, return nil, err } return &setnsProcess{ - cmd: cmd, - cgroupPaths: c.cgroupManager.GetPaths(), - rootlessCgroups: c.config.RootlessCgroups, - intelRdtPath: state.IntelRdtPath, - childPipe: childPipe, - parentPipe: parentPipe, - config: c.newInitConfig(p), - process: p, - bootstrapData: data, + cmd: cmd, + cgroupPaths: c.cgroupManager.GetPaths(), + intelRdtPath: state.IntelRdtPath, + childPipe: childPipe, + parentPipe: parentPipe, + config: c.newInitConfig(p), + process: p, + bootstrapData: data, }, nil } @@ -559,8 +556,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { PassedFilesCount: len(process.ExtraFiles), ContainerId: c.ID(), NoNewPrivileges: c.config.NoNewPrivileges, - RootlessEUID: c.config.RootlessEUID, - RootlessCgroups: c.config.RootlessCgroups, + Rootless: c.config.Rootless, AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, @@ -628,16 +624,16 @@ func (c *linuxContainer) Resume() error { func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { // XXX(cyphar): This requires cgroups. - if c.config.RootlessCgroups { - logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups") + if c.config.Rootless { + return nil, fmt.Errorf("cannot get OOM notifications from rootless container") } return notifyOnOOM(c.cgroupManager.GetPaths()) } func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { // XXX(cyphar): This requires cgroups. - if c.config.RootlessCgroups { - logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups") + if c.config.Rootless { + return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container") } return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) } @@ -672,7 +668,7 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc. Features: criuFeat, } - err := c.criuSwrk(nil, req, criuOpts, false, nil) + err := c.criuSwrk(nil, req, criuOpts, false) if err != nil { logrus.Debugf("%s", err) return fmt.Errorf("CRIU feature check failed") @@ -785,7 +781,7 @@ func (c *linuxContainer) checkCriuVersion(minVersion int) error { Type: &t, } - err := c.criuSwrk(nil, req, nil, false, nil) + err := c.criuSwrk(nil, req, nil, false) if err != nil { return fmt.Errorf("CRIU version check failed: %s", err) } @@ -877,41 +873,16 @@ func waitForCriuLazyServer(r *os.File, status string) error { return nil } -func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) { - // CRIU will evaluate a configuration starting with release 3.11. - // Settings in the configuration file will overwrite RPC settings. - // Look for annotations. The annotation 'org.criu.config' - // specifies if CRIU should use a different, container specific - // configuration file. - _, annotations := utils.Annotations(c.config.Labels) - configFile, exists := annotations["org.criu.config"] - if exists { - // If the annotation 'org.criu.config' exists and is set - // to a non-empty string, tell CRIU to use that as a - // configuration file. If the file does not exist, CRIU - // will just ignore it. - if configFile != "" { - rpcOpts.ConfigFile = proto.String(configFile) - } - // If 'org.criu.config' exists and is set to an empty - // string, a runc specific CRIU configuration file will - // be not set at all. - } else { - // If the mentioned annotation has not been found, specify - // a default CRIU configuration file. - rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf") - } -} - func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() - // Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). - // (CLI prints a warning) // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has // support for doing unprivileged dumps, but the setup of // rootless containers might make this complicated. + if c.config.Rootless { + return fmt.Errorf("cannot checkpoint a rootless container") + } // criu 1.5.2 => 10502 if err := c.checkCriuVersion(10502); err != nil { @@ -968,35 +939,6 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { LazyPages: proto.Bool(criuOpts.LazyPages), } - c.handleCriuConfigurationFile(&rpcOpts) - - // If the container is running in a network namespace and has - // a path to the network namespace configured, we will dump - // that network namespace as an external namespace and we - // will expect that the namespace exists during restore. - // This basically means that CRIU will ignore the namespace - // and expect to be setup correctly. - nsPath := c.config.Namespaces.PathOf(configs.NEWNET) - if nsPath != "" { - // For this to work we need at least criu 3.11.0 => 31100. - // As there was already a successful version check we will - // not error out if it fails. runc will just behave as it used - // to do and ignore external network namespaces. - err := c.checkCriuVersion(31100) - if err == nil { - // CRIU expects the information about an external namespace - // like this: --external net[]: - // This is always 'extRootNetNS'. - var netns syscall.Stat_t - err = syscall.Stat(nsPath, &netns) - if err != nil { - return err - } - criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino) - rpcOpts.External = append(rpcOpts.External, criuExternal) - } - } - fcg := c.cgroupManager.GetPaths()["freezer"] if fcg != "" { rpcOpts.FreezeCgroup = proto.String(fcg) @@ -1101,7 +1043,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { } } - err = c.criuSwrk(nil, req, criuOpts, false, nil) + err = c.criuSwrk(nil, req, criuOpts, false) if err != nil { return err } @@ -1141,85 +1083,15 @@ func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts } } -// makeCriuRestoreMountpoints makes the actual mountpoints for the -// restore using CRIU. This function is inspired from the code in -// rootfs_linux.go -func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error { - switch m.Device { - case "cgroup": - // Do nothing for cgroup, CRIU should handle it - case "bind": - // The prepareBindMount() function checks if source - // exists. So it cannot be used for other filesystem types. - if err := prepareBindMount(m, c.config.Rootfs); err != nil { - return err - } - default: - // for all other file-systems just create the mountpoints - dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination) - if err != nil { - return err - } - if err := checkMountDestination(c.config.Rootfs, dest); err != nil { - return err - } - m.Destination = dest - if err := os.MkdirAll(dest, 0755); err != nil { - return err - } - } - return nil -} - -// isPathInPrefixList is a small function for CRIU restore to make sure -// mountpoints, which are on a tmpfs, are not created in the roofs -func isPathInPrefixList(path string, prefix []string) bool { - for _, p := range prefix { - if strings.HasPrefix(path, p+"/") { - return false - } - } - return true -} - -// prepareCriuRestoreMounts tries to set up the rootfs of the -// container to be restored in the same way runc does it for -// initial container creation. Even for a read-only rootfs container -// runc modifies the rootfs to add mountpoints which do not exist. -// This function also creates missing mountpoints as long as they -// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway. -func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error { - // First get a list of a all tmpfs mounts - tmpfs := []string{} - for _, m := range mounts { - switch m.Device { - case "tmpfs": - tmpfs = append(tmpfs, m.Destination) - } - } - // Now go through all mounts and create the mountpoints - // if the mountpoints are not on a tmpfs, as CRIU will - // restore the complete tmpfs content from its checkpoint. - for _, m := range mounts { - if isPathInPrefixList(m.Destination, tmpfs) { - if err := c.makeCriuRestoreMountpoints(m); err != nil { - return err - } - } - } - return nil -} - func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() - var extraFiles []*os.File - - // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). - // (CLI prints a warning) // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have // support for unprivileged restore at the moment. + if c.config.Rootless { + return fmt.Errorf("cannot restore a rootless container") + } // criu 1.5.2 => 10502 if err := c.checkCriuVersion(10502); err != nil { @@ -1289,46 +1161,6 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { }, } - c.handleCriuConfigurationFile(req.Opts) - - // Same as during checkpointing. If the container has a specific network namespace - // assigned to it, this now expects that the checkpoint will be restored in a - // already created network namespace. - nsPath := c.config.Namespaces.PathOf(configs.NEWNET) - if nsPath != "" { - // For this to work we need at least criu 3.11.0 => 31100. - // As there was already a successful version check we will - // not error out if it fails. runc will just behave as it used - // to do and ignore external network namespaces. - err := c.checkCriuVersion(31100) - if err == nil { - // CRIU wants the information about an existing network namespace - // like this: --inherit-fd fd[]: - // The needs to be the same as during checkpointing. - // We are always using 'extRootNetNS' as the key in this. - netns, err := os.Open(nsPath) - defer netns.Close() - if err != nil { - logrus.Errorf("If a specific network namespace is defined it must exist: %s", err) - return fmt.Errorf("Requested network namespace %v does not exist", nsPath) - } - inheritFd := new(criurpc.InheritFd) - inheritFd.Key = proto.String("extRootNetNS") - // The offset of four is necessary because 0, 1, 2 and 3 is already - // used by stdin, stdout, stderr, 'criu swrk' socket. - inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles))) - req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) - // All open FDs need to be transferred to CRIU via extraFiles - extraFiles = append(extraFiles, netns) - } - } - - // This will modify the rootfs of the container in the same way runc - // modifies the container during initial creation. - if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil { - return err - } - for _, m := range c.config.Mounts { switch m.Device { case "bind": @@ -1387,7 +1219,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) } } - return c.criuSwrk(process, req, criuOpts, true, extraFiles) + return c.criuSwrk(process, req, criuOpts, true) } func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { @@ -1417,7 +1249,7 @@ func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { return nil } -func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error { +func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) if err != nil { return err @@ -1458,9 +1290,6 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * cmd.Stderr = process.Stderr } cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) - if extraFiles != nil { - cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...) - } if err := cmd.Start(); err != nil { return err @@ -1657,11 +1486,14 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc } case notify.GetScript() == "setup-namespaces": if c.config.Hooks != nil { - s, err := c.currentOCIState() - if err != nil { - return nil + bundle, annotations := utils.Annotations(c.config.Labels) + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Pid: int(notify.GetPid()), + Bundle: bundle, + Annotations: annotations, } - s.Pid = int(notify.GetPid()) for i, hook := range c.config.Hooks.Prestart { if err := hook.Run(s); err != nil { return newSystemErrorWithCausef(err, "running prestart hook %d", i) @@ -1832,7 +1664,7 @@ func (c *linuxContainer) currentState() (*State, error) { InitProcessStartTime: startTime, Created: c.created, }, - Rootless: c.config.RootlessEUID && c.config.RootlessCgroups, + Rootless: c.config.Rootless, CgroupPaths: c.cgroupManager.GetPaths(), IntelRdtPath: intelRdtPath, NamespacePaths: make(map[configs.NamespaceType]string), @@ -1855,31 +1687,11 @@ func (c *linuxContainer) currentState() (*State, error) { return state, nil } -func (c *linuxContainer) currentOCIState() (*specs.State, error) { - bundle, annotations := utils.Annotations(c.config.Labels) - state := &specs.State{ - Version: specs.Version, - ID: c.ID(), - Bundle: bundle, - Annotations: annotations, - } - status, err := c.currentStatus() - if err != nil { - return nil, err - } - state.Status = status.String() - if status != Stopped { - if c.initProcess != nil { - state.Pid = c.initProcess.pid() - } - } - return state, nil -} - // orderNamespacePaths sorts namespace paths into a list of paths that we // can setns in order. func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { paths := []string{} + for _, ns := range configs.NamespaceTypes() { // Remove namespaces that we don't need to join. @@ -1953,7 +1765,7 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na if !joinExistingUser { // write uid mappings if len(c.config.UidMappings) > 0 { - if c.config.RootlessEUID && c.newuidmapPath != "" { + if c.config.Rootless && c.newuidmapPath != "" { r.AddData(&Bytemsg{ Type: UidmapPathAttr, Value: []byte(c.newuidmapPath), @@ -1979,33 +1791,39 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na Type: GidmapAttr, Value: b, }) - if c.config.RootlessEUID && c.newgidmapPath != "" { + if c.config.Rootless && c.newgidmapPath != "" { r.AddData(&Bytemsg{ Type: GidmapPathAttr, Value: []byte(c.newgidmapPath), }) } - if requiresRootOrMappingTool(c.config) { - r.AddData(&Boolmsg{ - Type: SetgroupAttr, - Value: true, - }) + // The following only applies if we are root. + if !c.config.Rootless { + // check if we have CAP_SETGID to setgroup properly + pid, err := capability.NewPid(0) + if err != nil { + return nil, err + } + if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } } } } - if c.config.OomScoreAdj != nil { - // write oom_score_adj - r.AddData(&Bytemsg{ - Type: OomScoreAdjAttr, - Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)), - }) - } + // write oom_score_adj + r.AddData(&Bytemsg{ + Type: OomScoreAdjAttr, + Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), + }) // write rootless r.AddData(&Boolmsg{ - Type: RootlessEUIDAttr, - Value: c.config.RootlessEUID, + Type: RootlessAttr, + Value: c.config.Rootless, }) return bytes.NewReader(r.Serialize()), nil @@ -2025,10 +1843,3 @@ func ignoreTerminateErrors(err error) error { } return err } - -func requiresRootOrMappingTool(c *configs.Config) bool { - gidMap := []configs.IDMap{ - {ContainerID: 0, HostID: os.Getegid(), Size: 1}, - } - return !reflect.DeepEqual(c.GidMappings, gidMap) -} diff --git a/vendor/github.com/checkpoint-restore/go-criu/rpc/rpc.pb.go b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go similarity index 70% rename from vendor/github.com/checkpoint-restore/go-criu/rpc/rpc.pb.go rename to vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go index 230faace55..21af9db971 100644 --- a/vendor/github.com/checkpoint-restore/go-criu/rpc/rpc.pb.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/criurpc/criurpc.pb.go @@ -1,11 +1,12 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// source: rpc/rpc.proto +// Code generated by protoc-gen-go. +// source: criurpc.proto +// DO NOT EDIT! /* -Package rpc is a generated protocol buffer package. +Package criurpc is a generated protocol buffer package. It is generated from these files: - rpc/rpc.proto + criurpc.proto It has these top-level messages: CriuPageServerInfo @@ -24,7 +25,7 @@ It has these top-level messages: CriuResp CriuVersion */ -package rpc +package criurpc import proto "github.com/golang/protobuf/proto" import fmt "fmt" @@ -93,19 +94,17 @@ func (CriuCgMode) EnumDescriptor() ([]byte, []int) { return fileDescriptor0, []i type CriuReqType int32 const ( - CriuReqType_EMPTY CriuReqType = 0 - CriuReqType_DUMP CriuReqType = 1 - CriuReqType_RESTORE CriuReqType = 2 - CriuReqType_CHECK CriuReqType = 3 - CriuReqType_PRE_DUMP CriuReqType = 4 - CriuReqType_PAGE_SERVER CriuReqType = 5 - CriuReqType_NOTIFY CriuReqType = 6 - CriuReqType_CPUINFO_DUMP CriuReqType = 7 - CriuReqType_CPUINFO_CHECK CriuReqType = 8 - CriuReqType_FEATURE_CHECK CriuReqType = 9 - CriuReqType_VERSION CriuReqType = 10 - CriuReqType_WAIT_PID CriuReqType = 11 - CriuReqType_PAGE_SERVER_CHLD CriuReqType = 12 + CriuReqType_EMPTY CriuReqType = 0 + CriuReqType_DUMP CriuReqType = 1 + CriuReqType_RESTORE CriuReqType = 2 + CriuReqType_CHECK CriuReqType = 3 + CriuReqType_PRE_DUMP CriuReqType = 4 + CriuReqType_PAGE_SERVER CriuReqType = 5 + CriuReqType_NOTIFY CriuReqType = 6 + CriuReqType_CPUINFO_DUMP CriuReqType = 7 + CriuReqType_CPUINFO_CHECK CriuReqType = 8 + CriuReqType_FEATURE_CHECK CriuReqType = 9 + CriuReqType_VERSION CriuReqType = 10 ) var CriuReqType_name = map[int32]string{ @@ -120,23 +119,19 @@ var CriuReqType_name = map[int32]string{ 8: "CPUINFO_CHECK", 9: "FEATURE_CHECK", 10: "VERSION", - 11: "WAIT_PID", - 12: "PAGE_SERVER_CHLD", } var CriuReqType_value = map[string]int32{ - "EMPTY": 0, - "DUMP": 1, - "RESTORE": 2, - "CHECK": 3, - "PRE_DUMP": 4, - "PAGE_SERVER": 5, - "NOTIFY": 6, - "CPUINFO_DUMP": 7, - "CPUINFO_CHECK": 8, - "FEATURE_CHECK": 9, - "VERSION": 10, - "WAIT_PID": 11, - "PAGE_SERVER_CHLD": 12, + "EMPTY": 0, + "DUMP": 1, + "RESTORE": 2, + "CHECK": 3, + "PRE_DUMP": 4, + "PAGE_SERVER": 5, + "NOTIFY": 6, + "CPUINFO_DUMP": 7, + "CPUINFO_CHECK": 8, + "FEATURE_CHECK": 9, + "VERSION": 10, } func (x CriuReqType) Enum() *CriuReqType { @@ -398,7 +393,6 @@ type CriuOpts struct { LazyPages *bool `protobuf:"varint,48,opt,name=lazy_pages,json=lazyPages" json:"lazy_pages,omitempty"` StatusFd *int32 `protobuf:"varint,49,opt,name=status_fd,json=statusFd" json:"status_fd,omitempty"` OrphanPtsMaster *bool `protobuf:"varint,50,opt,name=orphan_pts_master,json=orphanPtsMaster" json:"orphan_pts_master,omitempty"` - ConfigFile *string `protobuf:"bytes,51,opt,name=config_file,json=configFile" json:"config_file,omitempty"` XXX_unrecognized []byte `json:"-"` } @@ -754,13 +748,6 @@ func (m *CriuOpts) GetOrphanPtsMaster() bool { return false } -func (m *CriuOpts) GetConfigFile() string { - if m != nil && m.ConfigFile != nil { - return *m.ConfigFile - } - return "" -} - type CriuDumpResp struct { Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"` XXX_unrecognized []byte `json:"-"` @@ -861,10 +848,8 @@ type CriuReq struct { // 'features' can be used to query which features // are supported by the installed criu/kernel // via RPC. - Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"` - // 'pid' is used for WAIT_PID - Pid *uint32 `protobuf:"varint,6,opt,name=pid" json:"pid,omitempty"` - XXX_unrecognized []byte `json:"-"` + Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"` + XXX_unrecognized []byte `json:"-"` } func (m *CriuReq) Reset() { *m = CriuReq{} } @@ -907,13 +892,6 @@ func (m *CriuReq) GetFeatures() *CriuFeatures { return nil } -func (m *CriuReq) GetPid() uint32 { - if m != nil && m.Pid != nil { - return *m.Pid - } - return 0 -} - type CriuResp struct { Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"` @@ -925,7 +903,6 @@ type CriuResp struct { Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"` CrErrmsg *string `protobuf:"bytes,9,opt,name=cr_errmsg,json=crErrmsg" json:"cr_errmsg,omitempty"` Version *CriuVersion `protobuf:"bytes,10,opt,name=version" json:"version,omitempty"` - Status *int32 `protobuf:"varint,11,opt,name=status" json:"status,omitempty"` XXX_unrecognized []byte `json:"-"` } @@ -1004,13 +981,6 @@ func (m *CriuResp) GetVersion() *CriuVersion { return nil } -func (m *CriuResp) GetStatus() int32 { - if m != nil && m.Status != nil { - return *m.Status - } - return 0 -} - // Answer for criu_req_type.VERSION requests type CriuVersion struct { Major *int32 `protobuf:"varint,1,req,name=major" json:"major,omitempty"` @@ -1089,123 +1059,120 @@ func init() { proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value) } -func init() { proto.RegisterFile("rpc/rpc.proto", fileDescriptor0) } +func init() { proto.RegisterFile("criurpc.proto", fileDescriptor0) } var fileDescriptor0 = []byte{ - // 1835 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x56, 0xeb, 0x72, 0x5b, 0xb7, - 0x11, 0x0e, 0x29, 0xf1, 0x06, 0x5e, 0x7c, 0x0c, 0x5f, 0x02, 0xc7, 0xb5, 0xad, 0xd0, 0x51, 0xa2, - 0x2a, 0x2e, 0x93, 0x30, 0x76, 0x5c, 0x67, 0xda, 0x1f, 0x1e, 0x8a, 0x74, 0xd8, 0x48, 0x22, 0x07, - 0xa4, 0xdc, 0xc9, 0x2f, 0xcc, 0xd1, 0x39, 0x20, 0x05, 0xf3, 0xdc, 0x0a, 0x80, 0x8a, 0xe4, 0x97, - 0xe8, 0xbf, 0x3e, 0x57, 0xde, 0xa4, 0xaf, 0xd0, 0xd9, 0x05, 0x28, 0x4b, 0x49, 0x66, 0xd2, 0x7f, - 0xd8, 0x0f, 0xbb, 0xc0, 0xde, 0x77, 0x49, 0x5b, 0x17, 0xd1, 0x57, 0xba, 0x88, 0x7a, 0x85, 0xce, - 0x6d, 0xde, 0x5d, 0x92, 0x7b, 0x91, 0x56, 0x6b, 0x51, 0x84, 0x4b, 0x29, 0x8c, 0xd4, 0xe7, 0x52, - 0x0b, 0x95, 0x2d, 0x72, 0xca, 0x48, 0x2d, 0x8c, 0x63, 0x2d, 0x8d, 0x61, 0xa5, 0x9d, 0xd2, 0x5e, - 0x83, 0x6f, 0x48, 0x4a, 0xc9, 0x76, 0x91, 0x6b, 0xcb, 0xca, 0x3b, 0xa5, 0xbd, 0x0a, 0xc7, 0x33, - 0x0d, 0xc8, 0x56, 0xa1, 0x62, 0xb6, 0x85, 0x10, 0x1c, 0x69, 0x87, 0x94, 0x17, 0x31, 0xdb, 0x46, - 0xa0, 0xbc, 0x88, 0xbb, 0x7f, 0x23, 0x1d, 0xfc, 0xe8, 0x5c, 0xda, 0x33, 0x51, 0x84, 0x4a, 0xd3, - 0x3b, 0xa4, 0xa2, 0x16, 0x42, 0x65, 0xac, 0xb4, 0x53, 0xde, 0x6b, 0xf0, 0x6d, 0xb5, 0x18, 0x67, - 0xf4, 0x1e, 0xa9, 0xaa, 0x85, 0xc8, 0xd7, 0xf0, 0x3c, 0xa0, 0x15, 0xb5, 0x98, 0xac, 0x6d, 0xf7, - 0x5b, 0xd2, 0x96, 0x17, 0x56, 0xa4, 0xf9, 0x3a, 0xb3, 0x22, 0x0d, 0x0b, 0xf8, 0x70, 0x25, 0x2f, - 0xbd, 0x28, 0x1c, 0x01, 0x39, 0x0f, 0x13, 0x2f, 0x06, 0xc7, 0xee, 0x5b, 0xd2, 0x79, 0x97, 0xab, - 0x4c, 0x64, 0x61, 0x2a, 0x4d, 0x11, 0x46, 0x12, 0x94, 0xca, 0x8c, 0x17, 0x2a, 0x67, 0x86, 0x7e, - 0x4c, 0x6a, 0x99, 0x11, 0x0b, 0x95, 0x48, 0x2f, 0x57, 0xcd, 0xcc, 0x48, 0x25, 0x92, 0x3e, 0x24, - 0x0d, 0x79, 0x61, 0x75, 0x28, 0xf2, 0xc2, 0xa2, 0x55, 0x0d, 0x5e, 0x47, 0x60, 0x52, 0xd8, 0x6e, - 0x8f, 0x10, 0x95, 0x9d, 0x49, 0xad, 0xac, 0x58, 0xc4, 0xbf, 0xa3, 0x89, 0x33, 0x1d, 0x1e, 0x74, - 0xa6, 0xbf, 0x20, 0xcd, 0x68, 0xa9, 0xf3, 0x75, 0x21, 0x74, 0x9e, 0x5b, 0xf0, 0x5f, 0x64, 0x75, - 0xe2, 0xdd, 0x8a, 0x67, 0xf4, 0x69, 0x68, 0xcf, 0xbc, 0x16, 0x78, 0xee, 0x3e, 0x21, 0xb5, 0x75, - 0xa6, 0x2e, 0x84, 0x59, 0xd1, 0xbb, 0xa4, 0xa2, 0xb2, 0x3c, 0x96, 0xf8, 0x4b, 0x9b, 0x3b, 0xa2, - 0xfb, 0xdf, 0x36, 0x69, 0xa0, 0x4f, 0xf3, 0xc2, 0x1a, 0xda, 0x25, 0x6d, 0x95, 0x86, 0x4b, 0x69, - 0x44, 0xac, 0xb4, 0x58, 0xc4, 0xc8, 0x5b, 0xe1, 0x4d, 0x07, 0x1e, 0x28, 0x3d, 0x8a, 0x37, 0x61, - 0x2a, 0x7f, 0x08, 0xd3, 0x53, 0xd2, 0x4e, 0x64, 0x78, 0x2e, 0x85, 0x5e, 0x67, 0x99, 0xca, 0x96, - 0x68, 0x6c, 0x9d, 0xb7, 0x10, 0xe4, 0x0e, 0xa3, 0x8f, 0x49, 0x13, 0xbc, 0xef, 0xb5, 0xc1, 0xa0, - 0xd6, 0x39, 0x38, 0xe8, 0x24, 0x53, 0x17, 0xb3, 0x15, 0xfd, 0x82, 0xdc, 0xb2, 0x51, 0x21, 0xa4, - 0xb1, 0xe1, 0x69, 0xa2, 0xcc, 0x99, 0x8c, 0x59, 0x05, 0x79, 0x3a, 0x36, 0x2a, 0x86, 0x1f, 0x50, - 0x60, 0x94, 0xe7, 0xa1, 0x51, 0xe7, 0x52, 0xc4, 0xf2, 0x5c, 0x45, 0xd2, 0xb0, 0xaa, 0x63, 0xf4, - 0xf0, 0x81, 0x43, 0xc1, 0xff, 0xe6, 0x4c, 0x26, 0x89, 0x78, 0x97, 0x9f, 0xb2, 0x1a, 0xb2, 0xd4, - 0x11, 0xf8, 0x47, 0x7e, 0x4a, 0x1f, 0x11, 0x02, 0x21, 0x13, 0x49, 0x1e, 0xad, 0x0c, 0xab, 0x3b, - 0x6d, 0x00, 0x39, 0x04, 0x80, 0x3e, 0x26, 0x8d, 0x24, 0x5f, 0x8a, 0x44, 0x9e, 0xcb, 0x84, 0x35, - 0xc0, 0xd4, 0xef, 0x4b, 0x7d, 0x5e, 0x4f, 0xf2, 0xe5, 0x21, 0x40, 0xf4, 0x01, 0x81, 0xb3, 0x8b, - 0x3a, 0x71, 0xa9, 0x9d, 0xe4, 0x4b, 0x0c, 0xfb, 0xe7, 0xa4, 0x5c, 0x18, 0xd6, 0xdc, 0x29, 0xed, - 0x35, 0xfb, 0xf7, 0x7b, 0xbf, 0x5b, 0x18, 0xbc, 0x5c, 0x18, 0xba, 0x4b, 0x3a, 0x59, 0x6e, 0xd5, - 0xe2, 0x52, 0x98, 0x48, 0xab, 0xc2, 0x1a, 0xd6, 0x42, 0x2d, 0xda, 0x0e, 0x9d, 0x39, 0x10, 0xa2, - 0x0a, 0x11, 0x67, 0x6d, 0x17, 0x69, 0x8c, 0xfe, 0x23, 0x42, 0x8a, 0x50, 0xcb, 0xcc, 0x0a, 0x95, - 0x2e, 0x59, 0x07, 0x6f, 0x1a, 0x0e, 0x19, 0xa7, 0x4b, 0x30, 0xdc, 0xea, 0x30, 0x5a, 0x89, 0x54, - 0xa6, 0xec, 0x96, 0x33, 0x1c, 0x81, 0x23, 0x99, 0x82, 0x6c, 0xb8, 0xb6, 0xb9, 0x88, 0x65, 0xbc, - 0x2e, 0x58, 0xe0, 0x0c, 0x07, 0xe4, 0x00, 0x00, 0x08, 0xd3, 0xcf, 0xb9, 0x5e, 0x6d, 0xe2, 0x7f, - 0x1b, 0xa3, 0xdc, 0x00, 0xc8, 0x45, 0xff, 0x11, 0x21, 0x89, 0xca, 0x56, 0x42, 0xcb, 0x34, 0x2c, - 0x18, 0x75, 0xe2, 0x80, 0x70, 0x00, 0xe8, 0x2e, 0xa9, 0x40, 0x71, 0x1a, 0x76, 0x67, 0x67, 0x6b, - 0xaf, 0xd9, 0xbf, 0xd5, 0xbb, 0x59, 0xaf, 0xdc, 0xdd, 0xd2, 0xa7, 0xa4, 0x16, 0x15, 0x6b, 0x11, - 0x85, 0x05, 0xbb, 0xbb, 0x53, 0xda, 0x6b, 0x7f, 0x4f, 0x9e, 0xf7, 0x5f, 0x3d, 0x7f, 0xf5, 0xdd, - 0xcb, 0xfe, 0xab, 0x17, 0xbc, 0x1a, 0x15, 0xeb, 0x41, 0x58, 0xd0, 0x27, 0xa4, 0xb9, 0xc8, 0x75, - 0x24, 0x85, 0xd2, 0xf0, 0xd7, 0x3d, 0xfc, 0x8b, 0x20, 0x34, 0x06, 0x04, 0x82, 0x20, 0x2f, 0x64, - 0x24, 0xa2, 0x34, 0x66, 0xf7, 0x77, 0xb6, 0x20, 0x08, 0x40, 0x0f, 0x52, 0x48, 0x92, 0x1a, 0xd6, - 0x7a, 0x66, 0xd9, 0xc7, 0xa8, 0x49, 0xa7, 0x77, 0xa3, 0xf6, 0x79, 0x55, 0x5e, 0xd8, 0xa3, 0xcc, - 0x42, 0x14, 0xd2, 0x30, 0x83, 0xf8, 0xb8, 0xf2, 0x32, 0x8c, 0xb9, 0x28, 0x38, 0x74, 0xe0, 0x40, - 0xba, 0x4b, 0x6a, 0xd1, 0x12, 0x4b, 0x8f, 0x3d, 0xc0, 0xf7, 0x5a, 0xbd, 0x6b, 0xe5, 0xc8, 0xab, - 0xd1, 0x92, 0x43, 0x60, 0x9e, 0x90, 0xa6, 0x36, 0x56, 0x18, 0x75, 0x9a, 0x40, 0x1d, 0x7c, 0xe2, - 0x54, 0xd6, 0xc6, 0xce, 0x1c, 0x42, 0xf7, 0xaf, 0x97, 0x3d, 0x7b, 0x88, 0x4f, 0x35, 0x7b, 0x1f, - 0x20, 0xde, 0xf0, 0xe7, 0x51, 0x4c, 0x77, 0x48, 0x0b, 0x23, 0xb5, 0x31, 0xe4, 0x4f, 0xee, 0x35, - 0xc0, 0x86, 0x4e, 0xf9, 0x27, 0xae, 0xa6, 0xcc, 0x59, 0xa8, 0xe1, 0xbb, 0x47, 0x8e, 0x41, 0x5e, - 0xd8, 0x99, 0x43, 0x36, 0x0c, 0x69, 0x68, 0xac, 0xd4, 0x86, 0x3d, 0xbe, 0x62, 0x38, 0x72, 0x08, - 0xb8, 0xd0, 0xac, 0x54, 0x81, 0xef, 0x3f, 0x71, 0x2e, 0x04, 0x1a, 0x1e, 0x87, 0xf6, 0x95, 0x85, - 0xa7, 0x89, 0x14, 0x0b, 0xc3, 0x76, 0xf0, 0xae, 0xee, 0x80, 0x91, 0xa1, 0x7b, 0xa4, 0xe9, 0x2b, - 0x59, 0xa8, 0x2c, 0x67, 0x9f, 0xa2, 0x21, 0xf5, 0x9e, 0xc7, 0x78, 0x63, 0x8d, 0x45, 0x3d, 0xce, - 0x72, 0xfa, 0x77, 0x72, 0xe7, 0xa6, 0x83, 0x45, 0x0a, 0x4d, 0xa8, 0xbb, 0x53, 0xda, 0xeb, 0xf4, - 0xdb, 0x2e, 0x3f, 0xa2, 0x25, 0x82, 0xfc, 0xf6, 0x0d, 0xa7, 0x1f, 0xe5, 0xb1, 0x84, 0x8f, 0x96, - 0x67, 0xb9, 0xb1, 0x22, 0x51, 0xa9, 0xb2, 0xec, 0x29, 0x66, 0x4b, 0xed, 0x9b, 0xaf, 0x9f, 0xff, - 0xf5, 0xc5, 0xcb, 0xef, 0x38, 0xc1, 0xbb, 0x43, 0xb8, 0xa2, 0x7b, 0x24, 0xc0, 0x44, 0x11, 0x26, - 0x0a, 0x33, 0x01, 0xdd, 0xcf, 0xb0, 0xcf, 0x50, 0xed, 0x0e, 0xe2, 0xb3, 0x28, 0xcc, 0xa6, 0x80, - 0xd2, 0x4f, 0x20, 0x6f, 0xac, 0xd4, 0x59, 0x98, 0xb0, 0x5d, 0x6f, 0x98, 0xa7, 0x31, 0xa7, 0xd2, - 0xc2, 0x5e, 0x8a, 0xcc, 0xb0, 0xcf, 0xe1, 0x33, 0x5e, 0x43, 0xfa, 0x18, 0x6c, 0xae, 0xb9, 0x51, - 0x60, 0xd8, 0x17, 0x3e, 0xbb, 0x6f, 0x8e, 0x06, 0x5e, 0x05, 0xfa, 0xd8, 0xd0, 0x4f, 0x49, 0xcb, - 0x67, 0x47, 0xa1, 0xf3, 0xc2, 0xb0, 0x3f, 0x63, 0x85, 0xfa, 0x06, 0x3e, 0x05, 0x88, 0xee, 0x93, - 0xdb, 0xd7, 0x59, 0x5c, 0x27, 0xd9, 0x47, 0xbe, 0x5b, 0xd7, 0xf8, 0xb0, 0xa3, 0x3c, 0x27, 0xf7, - 0x3d, 0x6f, 0xbc, 0x4e, 0x0b, 0x11, 0xe5, 0x99, 0xd5, 0x79, 0x92, 0x48, 0xcd, 0xbe, 0x44, 0xed, - 0xef, 0xba, 0xdb, 0x83, 0x75, 0x5a, 0x0c, 0xae, 0xee, 0xa0, 0x2b, 0x2f, 0xb4, 0x94, 0xef, 0x37, - 0x8e, 0x67, 0xcf, 0xf0, 0xf5, 0x96, 0x03, 0x9d, 0x8f, 0x61, 0x42, 0x5b, 0x95, 0x4a, 0x98, 0x95, - 0x7f, 0x71, 0xd6, 0x7a, 0x92, 0x7e, 0x49, 0x28, 0xf4, 0x63, 0xcc, 0x0e, 0x95, 0x89, 0x45, 0xa2, - 0x96, 0x67, 0x96, 0xf5, 0x30, 0x83, 0xa0, 0x53, 0xcf, 0x56, 0xaa, 0x18, 0x67, 0x23, 0x84, 0xc1, - 0xe0, 0x9f, 0x65, 0xb8, 0x12, 0xe6, 0xd2, 0x44, 0x36, 0x31, 0xec, 0x2b, 0x64, 0x6b, 0x02, 0x36, - 0x73, 0x10, 0x36, 0x8e, 0xf0, 0xfd, 0x25, 0xf6, 0x42, 0xc3, 0xbe, 0xf6, 0x8d, 0x23, 0x7c, 0x7f, - 0x39, 0x05, 0x00, 0x9b, 0xb5, 0x0d, 0xed, 0xda, 0x40, 0x5d, 0x7c, 0x83, 0x5d, 0xa7, 0xee, 0x80, - 0x51, 0x0c, 0xce, 0xca, 0x75, 0x71, 0x06, 0x61, 0xb5, 0xc6, 0x67, 0x33, 0xeb, 0x3b, 0x55, 0xdc, - 0xc5, 0xd4, 0x1a, 0x97, 0xd2, 0x90, 0xf2, 0x51, 0x9e, 0x2d, 0x94, 0x6f, 0xce, 0xdf, 0xa2, 0xd1, - 0xc4, 0x41, 0xe0, 0xcd, 0xee, 0x33, 0xbf, 0x44, 0xa0, 0x2f, 0xb5, 0x34, 0x05, 0xe4, 0x83, 0x96, - 0xc6, 0xe6, 0x5a, 0xc6, 0x38, 0x50, 0xeb, 0xfc, 0x8a, 0xee, 0xee, 0x92, 0xdb, 0xc8, 0xed, 0x01, - 0x27, 0xe0, 0x47, 0xa0, 0x1b, 0x8e, 0x70, 0xec, 0xbe, 0x24, 0x4d, 0x64, 0x73, 0xbd, 0x9b, 0xde, - 0x27, 0x55, 0xd7, 0xd4, 0xfd, 0x80, 0xf6, 0xd4, 0x6f, 0x67, 0x67, 0xf7, 0x47, 0xd2, 0x46, 0xc1, - 0x85, 0x0c, 0xed, 0x5a, 0x3b, 0x47, 0xa4, 0x32, 0x15, 0xd8, 0xaf, 0x37, 0xda, 0xa4, 0x32, 0x9d, - 0x03, 0xfd, 0x2b, 0x27, 0x96, 0x7f, 0xe5, 0xc4, 0xee, 0x2f, 0x25, 0x52, 0xf7, 0xda, 0xfe, 0x8b, - 0x76, 0xc9, 0xb6, 0xbd, 0x2c, 0xdc, 0xb8, 0xef, 0xf4, 0x3b, 0xbd, 0xcd, 0x85, 0x00, 0x94, 0xe3, - 0x1d, 0x7d, 0x4c, 0xb6, 0x61, 0xee, 0xe3, 0x4b, 0xcd, 0x3e, 0xe9, 0x5d, 0x6d, 0x02, 0x1c, 0xf1, - 0xeb, 0x33, 0x6a, 0x1d, 0x45, 0xb0, 0xc7, 0x6d, 0xdd, 0x98, 0x51, 0x0e, 0x04, 0x9d, 0x57, 0x52, - 0x16, 0x22, 0x2f, 0x64, 0xe6, 0x27, 0x7b, 0x1d, 0x80, 0x49, 0x21, 0x33, 0xba, 0x4f, 0xea, 0x1b, - 0xe3, 0x70, 0xa2, 0x37, 0x37, 0xba, 0x6c, 0x50, 0x7e, 0x75, 0xbf, 0xf1, 0x4f, 0x15, 0x53, 0x11, - 0xfd, 0xf3, 0xef, 0x2d, 0xbf, 0x9f, 0xa0, 0xe3, 0xff, 0x1f, 0x9b, 0x18, 0xa9, 0x6d, 0x94, 0x85, - 0x4d, 0xa8, 0xce, 0x37, 0x24, 0x7d, 0x4a, 0xb6, 0x21, 0xe8, 0x68, 0xc3, 0xd5, 0x6c, 0xba, 0x4a, - 0x03, 0x8e, 0x97, 0xf4, 0x19, 0xa9, 0xf9, 0x58, 0xa3, 0x25, 0xcd, 0x3e, 0xed, 0xfd, 0x26, 0x01, - 0xf8, 0x86, 0x85, 0x7e, 0x46, 0xaa, 0xce, 0x15, 0xde, 0xb4, 0x56, 0xef, 0x5a, 0x1a, 0x70, 0x7f, - 0xe7, 0x57, 0x82, 0xea, 0x1f, 0xae, 0x04, 0x0f, 0x20, 0x7c, 0x42, 0x6a, 0x9d, 0xe5, 0xb8, 0xb0, - 0x54, 0x78, 0x2d, 0xd2, 0x43, 0x20, 0x6f, 0x78, 0xb1, 0xfe, 0x07, 0x5e, 0x7c, 0x08, 0x2e, 0x83, - 0x67, 0x52, 0xb3, 0xc4, 0xe5, 0xa5, 0xc1, 0xeb, 0xf8, 0x4e, 0x6a, 0x96, 0x30, 0x19, 0xcf, 0xa5, - 0x36, 0x2a, 0xcf, 0x70, 0x71, 0x69, 0x6e, 0x7a, 0xb0, 0x07, 0xf9, 0xe6, 0x16, 0x73, 0x18, 0x0b, - 0x10, 0x77, 0x99, 0x0a, 0xf7, 0x54, 0xf7, 0x3f, 0x25, 0xd2, 0xba, 0x2e, 0x01, 0x8b, 0x65, 0x1a, - 0xbe, 0xcb, 0xb5, 0xaf, 0x07, 0x47, 0x20, 0xaa, 0xb2, 0x5c, 0xfb, 0x1d, 0xd6, 0x11, 0x80, 0x2e, - 0x95, 0xf5, 0x5b, 0x7e, 0x83, 0x3b, 0x02, 0x0a, 0xd0, 0xac, 0x4f, 0xdd, 0xb2, 0xb5, 0xed, 0x6b, - 0xdf, 0xd3, 0x20, 0x81, 0x4b, 0x33, 0x3a, 0xb8, 0xc2, 0x1d, 0x01, 0x5b, 0x11, 0xb4, 0x5d, 0xf4, - 0x69, 0x83, 0xe3, 0x79, 0x5f, 0x78, 0xbd, 0xfc, 0x34, 0xa1, 0x84, 0x54, 0xc7, 0x6f, 0x8e, 0x27, - 0x7c, 0x18, 0x7c, 0x44, 0x9b, 0xa4, 0x36, 0x78, 0x23, 0x8e, 0x27, 0xc7, 0xc3, 0xa0, 0x44, 0x1b, - 0xa4, 0x32, 0xe5, 0x93, 0xe9, 0x2c, 0x28, 0xd3, 0x3a, 0xd9, 0x9e, 0x4d, 0x46, 0xf3, 0x60, 0x0b, - 0x4e, 0xa3, 0x93, 0xc3, 0xc3, 0x60, 0x1b, 0xe4, 0x66, 0x73, 0x3e, 0x1e, 0xcc, 0x83, 0x0a, 0xc8, - 0x1d, 0x0c, 0x47, 0xaf, 0x4f, 0x0e, 0xe7, 0x41, 0x75, 0xff, 0x97, 0x92, 0x2f, 0xd6, 0x4d, 0xc6, - 0xc1, 0x4b, 0xc3, 0xa3, 0xe9, 0xfc, 0xa7, 0xe0, 0x23, 0x90, 0x3f, 0x38, 0x39, 0x9a, 0x06, 0x25, - 0x90, 0xe1, 0xc3, 0xd9, 0x1c, 0x3e, 0x2e, 0x03, 0xc7, 0xe0, 0x87, 0xe1, 0xe0, 0xc7, 0x60, 0x8b, - 0xb6, 0x48, 0x7d, 0xca, 0x87, 0x02, 0xb9, 0xb6, 0xe9, 0x2d, 0xd2, 0x9c, 0xbe, 0x7e, 0x33, 0x14, - 0xb3, 0x21, 0x7f, 0x3b, 0xe4, 0x41, 0x05, 0xbe, 0x3d, 0x9e, 0xcc, 0xc7, 0xa3, 0x9f, 0x82, 0x2a, - 0x0d, 0x48, 0x6b, 0x30, 0x3d, 0x19, 0x1f, 0x8f, 0x26, 0x8e, 0xbd, 0x46, 0x6f, 0x93, 0xf6, 0x06, - 0x71, 0xef, 0xd5, 0x01, 0x1a, 0x0d, 0x5f, 0xcf, 0x4f, 0xf8, 0xd0, 0x43, 0x0d, 0xf8, 0xfa, 0xed, - 0x90, 0xcf, 0xc6, 0x93, 0xe3, 0x80, 0xc0, 0x7f, 0xff, 0x7c, 0x3d, 0x9e, 0x8b, 0xe9, 0xf8, 0x20, - 0x68, 0xd2, 0xbb, 0x24, 0xb8, 0xf6, 0x9f, 0x18, 0xfc, 0x70, 0x78, 0x10, 0xb4, 0xfe, 0x17, 0x00, - 0x00, 0xff, 0xff, 0xf8, 0x9f, 0x0e, 0x7d, 0xca, 0x0d, 0x00, 0x00, + // 1781 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x56, 0xdd, 0x72, 0x5b, 0xb7, + 0x11, 0x0e, 0x29, 0xfe, 0x1c, 0x82, 0x3f, 0xa6, 0x10, 0xdb, 0x81, 0x93, 0xda, 0x62, 0xe8, 0x28, + 0x51, 0x15, 0x97, 0x4d, 0x58, 0x3b, 0xae, 0x33, 0xed, 0x85, 0x47, 0x22, 0x5d, 0x36, 0x92, 0xc8, + 0x01, 0x25, 0xcf, 0xe4, 0x0a, 0x73, 0x74, 0x0e, 0x48, 0xc1, 0x3c, 0x7f, 0x05, 0x40, 0x45, 0xf2, + 0x83, 0xf4, 0x29, 0xfa, 0x0c, 0x7d, 0x84, 0xbe, 0x4e, 0x6f, 0x3b, 0xbb, 0x00, 0x65, 0x29, 0xc9, + 0xb4, 0xbd, 0xc3, 0x7e, 0x58, 0x00, 0xbb, 0xfb, 0xed, 0x0f, 0x48, 0x3b, 0xd2, 0x6a, 0xad, 0x8b, + 0x68, 0x50, 0xe8, 0xdc, 0xe6, 0xfd, 0x25, 0x79, 0x00, 0x80, 0x28, 0xc2, 0xa5, 0x14, 0x46, 0xea, + 0x4b, 0xa9, 0x85, 0xca, 0x16, 0x39, 0x65, 0xa4, 0x1e, 0xc6, 0xb1, 0x96, 0xc6, 0xb0, 0x52, 0xaf, + 0xb4, 0xd7, 0xe0, 0x1b, 0x91, 0x52, 0x52, 0x29, 0x72, 0x6d, 0x59, 0xb9, 0x57, 0xda, 0xab, 0x72, + 0x5c, 0xd3, 0x2e, 0xd9, 0x2a, 0x54, 0xcc, 0xb6, 0x10, 0x82, 0x25, 0xed, 0x90, 0xf2, 0x22, 0x66, + 0x15, 0x04, 0xca, 0x8b, 0xb8, 0xff, 0x27, 0xd2, 0xc1, 0x87, 0x2e, 0xa5, 0xbd, 0x10, 0x45, 0xa8, + 0x34, 0xfd, 0x98, 0x54, 0xd5, 0x42, 0xa8, 0x8c, 0x95, 0x7a, 0xe5, 0xbd, 0x06, 0xaf, 0xa8, 0xc5, + 0x24, 0xa3, 0x0f, 0x48, 0x4d, 0x2d, 0x44, 0xbe, 0x86, 0xeb, 0x01, 0xad, 0xaa, 0xc5, 0x74, 0x6d, + 0xfb, 0x7f, 0x20, 0x6d, 0x79, 0x65, 0x45, 0x9a, 0xaf, 0x33, 0x2b, 0xd2, 0xb0, 0x80, 0x07, 0x57, + 0xf2, 0xda, 0x1f, 0x85, 0x25, 0x20, 0x97, 0x61, 0xe2, 0x8f, 0xc1, 0xb2, 0xff, 0x96, 0x74, 0xde, + 0xe5, 0x2a, 0x13, 0x59, 0x98, 0x4a, 0x53, 0x84, 0x91, 0x04, 0xa3, 0x32, 0xe3, 0x0f, 0x95, 0x33, + 0x43, 0x3f, 0x21, 0xf5, 0xcc, 0x88, 0x85, 0x4a, 0xa4, 0x3f, 0x57, 0xcb, 0xcc, 0x58, 0x25, 0x92, + 0x7e, 0x46, 0x1a, 0xf2, 0xca, 0xea, 0x50, 0xe4, 0x85, 0x45, 0xaf, 0x1a, 0x3c, 0x40, 0x60, 0x5a, + 0xd8, 0xfe, 0x80, 0x10, 0x95, 0x5d, 0x48, 0xad, 0xac, 0x58, 0xc4, 0xbf, 0x62, 0x89, 0x73, 0x1d, + 0x2e, 0x74, 0xae, 0xbf, 0x20, 0xcd, 0x68, 0xa9, 0xf3, 0x75, 0x21, 0x74, 0x9e, 0x5b, 0x88, 0x5f, + 0x64, 0x75, 0xe2, 0xc3, 0x8a, 0x6b, 0x8c, 0x69, 0x68, 0x2f, 0xbc, 0x15, 0xb8, 0xee, 0xef, 0x90, + 0xfa, 0x3a, 0x53, 0x57, 0xc2, 0xac, 0xe8, 0x7d, 0x52, 0x55, 0x59, 0x1e, 0x4b, 0x7c, 0xa5, 0xcd, + 0x9d, 0xd0, 0xff, 0x57, 0x9b, 0x34, 0x30, 0xa6, 0x79, 0x61, 0x0d, 0xed, 0x93, 0xb6, 0x4a, 0xc3, + 0xa5, 0x34, 0x22, 0x56, 0x5a, 0x2c, 0x62, 0xd4, 0xad, 0xf2, 0xa6, 0x03, 0x0f, 0x95, 0x1e, 0xc7, + 0x1b, 0x9a, 0xca, 0x1f, 0x68, 0x7a, 0x4a, 0xda, 0x89, 0x0c, 0x2f, 0xa5, 0xd0, 0xeb, 0x2c, 0x53, + 0xd9, 0x12, 0x9d, 0x0d, 0x78, 0x0b, 0x41, 0xee, 0x30, 0xfa, 0x84, 0x34, 0x21, 0xfa, 0xde, 0x1a, + 0x24, 0x35, 0xe0, 0x10, 0xa0, 0xb3, 0x4c, 0x5d, 0xcd, 0x57, 0xf4, 0x2b, 0x72, 0xcf, 0x46, 0x85, + 0x90, 0xc6, 0x86, 0xe7, 0x89, 0x32, 0x17, 0x32, 0x66, 0x55, 0xd4, 0xe9, 0xd8, 0xa8, 0x18, 0x7d, + 0x40, 0x41, 0x51, 0x5e, 0x86, 0x46, 0x5d, 0x4a, 0x11, 0xcb, 0x4b, 0x15, 0x49, 0xc3, 0x6a, 0x4e, + 0xd1, 0xc3, 0x87, 0x0e, 0x85, 0xf8, 0x9b, 0x0b, 0x99, 0x24, 0xe2, 0x5d, 0x7e, 0xce, 0xea, 0xa8, + 0x12, 0x20, 0xf0, 0xd7, 0xfc, 0x9c, 0x3e, 0x26, 0x04, 0x28, 0x13, 0x49, 0x1e, 0xad, 0x0c, 0x0b, + 0x9c, 0x35, 0x80, 0x1c, 0x01, 0x40, 0x9f, 0x90, 0x46, 0x92, 0x2f, 0x45, 0x22, 0x2f, 0x65, 0xc2, + 0x1a, 0xe0, 0xea, 0xf7, 0xa5, 0x21, 0x0f, 0x92, 0x7c, 0x79, 0x04, 0x10, 0x7d, 0x44, 0x60, 0xed, + 0x58, 0x27, 0x2e, 0xb5, 0x93, 0x7c, 0x89, 0xb4, 0x7f, 0x49, 0xca, 0x85, 0x61, 0xcd, 0x5e, 0x69, + 0xaf, 0x39, 0x7c, 0x38, 0xf8, 0xd5, 0xc2, 0xe0, 0xe5, 0xc2, 0xd0, 0x5d, 0xd2, 0xc9, 0x72, 0xab, + 0x16, 0xd7, 0xc2, 0x44, 0x5a, 0x15, 0xd6, 0xb0, 0x16, 0x5a, 0xd1, 0x76, 0xe8, 0xdc, 0x81, 0xc0, + 0x2a, 0x30, 0xce, 0xda, 0x8e, 0x69, 0x64, 0xff, 0x31, 0x21, 0x45, 0xa8, 0x65, 0x66, 0x85, 0x4a, + 0x97, 0xac, 0x83, 0x3b, 0x0d, 0x87, 0x4c, 0xd2, 0x25, 0x38, 0x6e, 0x75, 0x18, 0xad, 0x44, 0x2a, + 0x53, 0x76, 0xcf, 0x39, 0x8e, 0xc0, 0xb1, 0x4c, 0xe1, 0x6c, 0xb8, 0xb6, 0xb9, 0x88, 0x65, 0xbc, + 0x2e, 0x58, 0xd7, 0x39, 0x0e, 0xc8, 0x21, 0x00, 0x40, 0xd3, 0x4f, 0xb9, 0x5e, 0x6d, 0xf8, 0xdf, + 0x46, 0x96, 0x1b, 0x00, 0x39, 0xf6, 0x1f, 0x13, 0x92, 0xa8, 0x6c, 0x25, 0xb4, 0x4c, 0xc3, 0x82, + 0x51, 0x77, 0x1c, 0x10, 0x0e, 0x00, 0xdd, 0x25, 0x55, 0x28, 0x4e, 0xc3, 0x3e, 0xee, 0x6d, 0xed, + 0x35, 0x87, 0xf7, 0x06, 0x77, 0xeb, 0x95, 0xbb, 0x5d, 0xfa, 0x94, 0xd4, 0xa3, 0x62, 0x2d, 0xa2, + 0xb0, 0x60, 0xf7, 0x7b, 0xa5, 0xbd, 0xf6, 0xf7, 0xe4, 0xf9, 0xf0, 0xd5, 0xf3, 0x57, 0xdf, 0xbd, + 0x1c, 0xbe, 0x7a, 0xc1, 0x6b, 0x51, 0xb1, 0x3e, 0x08, 0x0b, 0xba, 0x43, 0x9a, 0x8b, 0x5c, 0x47, + 0x52, 0x28, 0x0d, 0x6f, 0x3d, 0xc0, 0xb7, 0x08, 0x42, 0x13, 0x40, 0x80, 0x04, 0x79, 0x25, 0x23, + 0x11, 0xa5, 0x31, 0x7b, 0xd8, 0xdb, 0x02, 0x12, 0x40, 0x3e, 0x48, 0x21, 0x49, 0xea, 0x58, 0xeb, + 0x99, 0x65, 0x9f, 0xa0, 0x25, 0x9d, 0xc1, 0x9d, 0xda, 0xe7, 0x35, 0x79, 0x65, 0x8f, 0x33, 0x0b, + 0x2c, 0xa4, 0x61, 0x06, 0xfc, 0xb8, 0xf2, 0x32, 0x8c, 0x39, 0x16, 0x1c, 0x7a, 0xe0, 0x40, 0xba, + 0x4b, 0xea, 0xd1, 0x12, 0x4b, 0x8f, 0x3d, 0xc2, 0xfb, 0x5a, 0x83, 0x5b, 0xe5, 0xc8, 0x6b, 0xd1, + 0x92, 0x03, 0x31, 0x3b, 0xa4, 0xa9, 0x8d, 0x15, 0x46, 0x9d, 0x27, 0x50, 0x07, 0x9f, 0x3a, 0x93, + 0xb5, 0xb1, 0x73, 0x87, 0xd0, 0xfd, 0xdb, 0x65, 0xcf, 0x3e, 0xc3, 0xab, 0x9a, 0x83, 0x0f, 0x10, + 0x6f, 0xf8, 0xf5, 0x38, 0xa6, 0x3d, 0xd2, 0x42, 0xa6, 0x36, 0x8e, 0xfc, 0xc6, 0xdd, 0x06, 0xd8, + 0xc8, 0x19, 0xbf, 0xe3, 0x6a, 0xca, 0x5c, 0x84, 0x1a, 0x9e, 0x7b, 0xec, 0x14, 0xe4, 0x95, 0x9d, + 0x3b, 0x64, 0xa3, 0x90, 0x86, 0xc6, 0x4a, 0x6d, 0xd8, 0x93, 0x1b, 0x85, 0x63, 0x87, 0x40, 0x08, + 0xcd, 0x4a, 0x15, 0x78, 0xff, 0x8e, 0x0b, 0x21, 0xc8, 0x70, 0x39, 0xb4, 0xaf, 0x2c, 0x3c, 0x4f, + 0xa4, 0x58, 0x18, 0xd6, 0xc3, 0xbd, 0xc0, 0x01, 0x63, 0x43, 0xf7, 0x48, 0xd3, 0x57, 0xb2, 0x50, + 0x59, 0xce, 0x3e, 0x47, 0x47, 0x82, 0x81, 0xc7, 0x78, 0x63, 0x8d, 0x45, 0x3d, 0xc9, 0x72, 0xfa, + 0x67, 0xf2, 0xf1, 0xdd, 0x00, 0x8b, 0x14, 0x9a, 0x50, 0xbf, 0x57, 0xda, 0xeb, 0x0c, 0xdb, 0x2e, + 0x3f, 0xa2, 0x25, 0x82, 0x7c, 0xfb, 0x4e, 0xd0, 0x8f, 0xf3, 0x58, 0xc2, 0x43, 0xcb, 0x8b, 0xdc, + 0x58, 0x91, 0xa8, 0x54, 0x59, 0xf6, 0x14, 0xb3, 0xa5, 0xfe, 0xed, 0x37, 0xcf, 0xff, 0xf8, 0xe2, + 0xe5, 0x77, 0x9c, 0xe0, 0xde, 0x11, 0x6c, 0xd1, 0x3d, 0xd2, 0xc5, 0x44, 0x11, 0x26, 0x0a, 0x33, + 0x01, 0xdd, 0xcf, 0xb0, 0x2f, 0xd0, 0xec, 0x0e, 0xe2, 0xf3, 0x28, 0xcc, 0x66, 0x80, 0xd2, 0x4f, + 0x21, 0x6f, 0xac, 0xd4, 0x59, 0x98, 0xb0, 0x5d, 0xef, 0x98, 0x97, 0x31, 0xa7, 0xd2, 0xc2, 0x5e, + 0x8b, 0xcc, 0xb0, 0x2f, 0xe1, 0x31, 0x5e, 0x47, 0xf9, 0x04, 0x7c, 0xae, 0xbb, 0x51, 0x60, 0xd8, + 0x57, 0x3e, 0xbb, 0xef, 0x8e, 0x06, 0x5e, 0x03, 0xf9, 0xc4, 0xd0, 0xcf, 0x49, 0xcb, 0x67, 0x47, + 0xa1, 0xf3, 0xc2, 0xb0, 0xdf, 0x62, 0x85, 0xfa, 0x06, 0x3e, 0x03, 0x88, 0xee, 0x93, 0xed, 0xdb, + 0x2a, 0xae, 0x93, 0xec, 0xa3, 0xde, 0xbd, 0x5b, 0x7a, 0xd8, 0x51, 0x9e, 0x93, 0x87, 0x5e, 0x37, + 0x5e, 0xa7, 0x85, 0x88, 0xf2, 0xcc, 0xea, 0x3c, 0x49, 0xa4, 0x66, 0x5f, 0xa3, 0xf5, 0xf7, 0xdd, + 0xee, 0xe1, 0x3a, 0x2d, 0x0e, 0x6e, 0xf6, 0xa0, 0x2b, 0x2f, 0xb4, 0x94, 0xef, 0x37, 0x81, 0x67, + 0xcf, 0xf0, 0xf6, 0x96, 0x03, 0x5d, 0x8c, 0x61, 0x42, 0x5b, 0x95, 0x4a, 0x98, 0x95, 0xbf, 0x73, + 0xde, 0x7a, 0x91, 0x7e, 0x4d, 0x28, 0xf4, 0x63, 0xcc, 0x0e, 0x95, 0x89, 0x45, 0xa2, 0x96, 0x17, + 0x96, 0x0d, 0x30, 0x83, 0xa0, 0x53, 0xcf, 0x57, 0xaa, 0x98, 0x64, 0x63, 0x84, 0xc1, 0xe1, 0x9f, + 0x64, 0xb8, 0x12, 0xe6, 0xda, 0x44, 0x36, 0x31, 0xec, 0xf7, 0xa8, 0xd6, 0x04, 0x6c, 0xee, 0x20, + 0x6c, 0x1c, 0xe1, 0xfb, 0x6b, 0xec, 0x85, 0x86, 0x7d, 0xe3, 0x1b, 0x47, 0xf8, 0xfe, 0x7a, 0x06, + 0x00, 0x36, 0x6b, 0x1b, 0xda, 0xb5, 0x81, 0xba, 0xf8, 0x16, 0xbb, 0x4e, 0xe0, 0x80, 0x71, 0x0c, + 0xc1, 0xca, 0x75, 0x71, 0x01, 0xb4, 0x5a, 0xe3, 0xb3, 0x99, 0x0d, 0x9d, 0x29, 0x6e, 0x63, 0x66, + 0x8d, 0x4b, 0xe9, 0xfe, 0x33, 0xff, 0x47, 0xc0, 0x50, 0x69, 0x69, 0x0a, 0xa0, 0x5b, 0x4b, 0x63, + 0x73, 0x2d, 0x63, 0x9c, 0x97, 0x01, 0xbf, 0x91, 0xfb, 0xbb, 0x64, 0x1b, 0xb5, 0x3d, 0xe0, 0x0e, + 0xf8, 0x09, 0xe7, 0x66, 0x1f, 0x2c, 0xfb, 0x2f, 0x49, 0x13, 0xd5, 0x5c, 0x6b, 0xa6, 0x0f, 0x49, + 0xcd, 0xf5, 0x6c, 0x3f, 0x7f, 0xbd, 0xf4, 0xcb, 0xd1, 0xd8, 0xff, 0xc1, 0xfd, 0x95, 0xc4, 0x42, + 0x86, 0x76, 0xad, 0x9d, 0x9f, 0xa9, 0x4c, 0x05, 0xb6, 0xe3, 0x8d, 0x35, 0xa9, 0x4c, 0x4f, 0x41, + 0xfe, 0x59, 0x8c, 0xca, 0x3f, 0x8b, 0x51, 0xff, 0x9f, 0x25, 0x12, 0x78, 0x6b, 0xff, 0x46, 0xfb, + 0xa4, 0x62, 0xaf, 0x0b, 0x37, 0xcd, 0x3b, 0xc3, 0xce, 0x60, 0xb3, 0x21, 0x00, 0xe5, 0xb8, 0x47, + 0x9f, 0x90, 0x0a, 0x8c, 0x75, 0xbc, 0xa9, 0x39, 0x24, 0x83, 0x9b, 0x41, 0xcf, 0x11, 0xbf, 0x3d, + 0x82, 0xd6, 0x51, 0x04, 0xdf, 0xb4, 0xad, 0x3b, 0x23, 0xc8, 0x81, 0x60, 0xf3, 0x4a, 0xca, 0x42, + 0xe4, 0x85, 0xcc, 0xfc, 0xe0, 0x0e, 0x00, 0x98, 0x16, 0x32, 0xa3, 0xfb, 0x24, 0xd8, 0x38, 0x87, + 0x03, 0xbb, 0xb9, 0xb1, 0x65, 0x83, 0xf2, 0x9b, 0xfd, 0xfe, 0xbf, 0xcb, 0xfe, 0xb3, 0x81, 0x61, + 0xfe, 0x7f, 0x3c, 0x60, 0xa4, 0xbe, 0x31, 0x0d, 0xbe, 0x35, 0x01, 0xdf, 0x88, 0xf4, 0x29, 0xa9, + 0x00, 0xc5, 0x68, 0xf1, 0xcd, 0xa0, 0xb9, 0x21, 0x9d, 0xe3, 0x26, 0x7d, 0x46, 0xea, 0x9e, 0x59, + 0xb4, 0xbb, 0x39, 0xa4, 0x83, 0x5f, 0xd0, 0xcd, 0x37, 0x2a, 0xf4, 0x0b, 0x52, 0x73, 0x8e, 0x7b, + 0x47, 0x5a, 0x83, 0x5b, 0xa4, 0x73, 0xbf, 0xe7, 0xe7, 0x7b, 0xed, 0x7f, 0xce, 0xf7, 0x47, 0x40, + 0x96, 0x90, 0x5a, 0x67, 0x39, 0xfe, 0x3e, 0xaa, 0xbc, 0x1e, 0xe9, 0x11, 0x88, 0x77, 0x62, 0x16, + 0xfc, 0xf7, 0x98, 0x41, 0xf0, 0xdd, 0x35, 0xa9, 0x59, 0xe2, 0x4f, 0xa4, 0xc1, 0x03, 0xbc, 0x27, + 0x35, 0x4b, 0x18, 0x73, 0x97, 0x52, 0x1b, 0x95, 0x67, 0xf8, 0x0b, 0x69, 0x6e, 0x1a, 0xaa, 0x07, + 0xf9, 0x66, 0xb7, 0xff, 0xf7, 0x12, 0x69, 0xdd, 0xde, 0x81, 0xdf, 0x60, 0x1a, 0xbe, 0xcb, 0xb5, + 0xcf, 0x72, 0x27, 0x20, 0xaa, 0xb2, 0x5c, 0xfb, 0x8f, 0xa7, 0x13, 0x00, 0x5d, 0x2a, 0xeb, 0xbf, + 0xe6, 0x0d, 0xee, 0x04, 0x28, 0x2b, 0xb3, 0x3e, 0x77, 0x3f, 0xa4, 0x8a, 0x2f, 0x58, 0x2f, 0xc3, + 0x09, 0xfc, 0xe9, 0x62, 0x20, 0xab, 0xdc, 0x09, 0xf0, 0x95, 0x81, 0x5e, 0x89, 0xb1, 0x6b, 0x70, + 0x5c, 0xef, 0x0b, 0x6f, 0x97, 0x1f, 0x01, 0x94, 0x90, 0xda, 0xe4, 0xcd, 0xc9, 0x94, 0x8f, 0xba, + 0x1f, 0xd1, 0x26, 0xa9, 0x1f, 0xbc, 0x11, 0x27, 0xd3, 0x93, 0x51, 0xb7, 0x44, 0x1b, 0xa4, 0x3a, + 0xe3, 0xd3, 0xd9, 0xbc, 0x5b, 0xa6, 0x01, 0xa9, 0xcc, 0xa7, 0xe3, 0xd3, 0xee, 0x16, 0xac, 0xc6, + 0x67, 0x47, 0x47, 0xdd, 0x0a, 0x9c, 0x9b, 0x9f, 0xf2, 0xc9, 0xc1, 0x69, 0xb7, 0x0a, 0xe7, 0x0e, + 0x47, 0xe3, 0xd7, 0x67, 0x47, 0xa7, 0xdd, 0xda, 0xfe, 0x3f, 0x4a, 0xbe, 0x04, 0x37, 0x99, 0x05, + 0x37, 0x8d, 0x8e, 0x67, 0xa7, 0x3f, 0x76, 0x3f, 0x82, 0xf3, 0x87, 0x67, 0xc7, 0xb3, 0x6e, 0x09, + 0xce, 0xf0, 0xd1, 0xfc, 0x14, 0x1e, 0x2e, 0x83, 0xc6, 0xc1, 0x5f, 0x46, 0x07, 0x3f, 0x74, 0xb7, + 0x68, 0x8b, 0x04, 0x33, 0x3e, 0x12, 0xa8, 0x55, 0xa1, 0xf7, 0x48, 0x73, 0xf6, 0xfa, 0xcd, 0x48, + 0xcc, 0x47, 0xfc, 0xed, 0x88, 0x77, 0xab, 0xf0, 0xec, 0xc9, 0xf4, 0x74, 0x32, 0xfe, 0xb1, 0x5b, + 0xa3, 0x5d, 0xd2, 0x3a, 0x98, 0x9d, 0x4d, 0x4e, 0xc6, 0x53, 0xa7, 0x5e, 0xa7, 0xdb, 0xa4, 0xbd, + 0x41, 0xdc, 0x7d, 0x01, 0x40, 0xe3, 0xd1, 0xeb, 0xd3, 0x33, 0x3e, 0xf2, 0x50, 0x03, 0x9e, 0x7e, + 0x3b, 0xe2, 0xf3, 0xc9, 0xf4, 0xa4, 0x4b, 0xfe, 0x13, 0x00, 0x00, 0xff, 0xff, 0x5f, 0x2a, 0xaf, + 0x49, 0x5b, 0x0d, 0x00, 0x00, } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go index e35957c314..7d53d5e04d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -11,7 +11,6 @@ import ( "runtime/debug" "strconv" - "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" @@ -60,9 +59,9 @@ func SystemdCgroups(l *LinuxFactory) error { return nil } -// Cgroupfs is an options func to configure a LinuxFactory to return containers -// that use the native cgroups filesystem implementation to create and manage -// cgroups. +// Cgroupfs is an options func to configure a LinuxFactory to return +// containers that use the native cgroups filesystem implementation to +// create and manage cgroups. func Cgroupfs(l *LinuxFactory) error { l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { return &fs.Manager{ @@ -73,26 +72,9 @@ func Cgroupfs(l *LinuxFactory) error { return nil } -// RootlessCgroupfs is an options func to configure a LinuxFactory to return -// containers that use the native cgroups filesystem implementation to create -// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is -// that RootlessCgroupfs can transparently handle permission errors that occur -// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if -// they've been set up properly). -func RootlessCgroupfs(l *LinuxFactory) error { - l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { - return &fs.Manager{ - Cgroups: config, - Rootless: true, - Paths: paths, - } - } - return nil -} - // IntelRdtfs is an options func to configure a LinuxFactory to return // containers that use the Intel RDT "resource control" filesystem to -// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth). +// create and manage Intel Xeon platform shared resources (e.g., L3 cache). func IntelRdtFs(l *LinuxFactory) error { l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { return &intelrdt.IntelRdtManager{ @@ -196,10 +178,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := l.Validator.Validate(config); err != nil { return nil, newGenericError(err, ConfigInvalid) } - containerRoot, err := securejoin.SecureJoin(l.Root, id) - if err != nil { - return nil, err - } + containerRoot := filepath.Join(l.Root, id) if _, err := os.Stat(containerRoot); err == nil { return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) } else if !os.IsNotExist(err) { @@ -222,7 +201,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err newgidmapPath: l.NewgidmapPath, cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), } - if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { + if intelrdt.IsEnabled() { c.intelRdtManager = l.NewIntelRdtManager(config, id, "") } c.state = &stoppedState{c: c} @@ -233,14 +212,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) { if l.Root == "" { return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) } - //when load, we need to check id is valid or not. - if err := l.validateID(id); err != nil { - return nil, err - } - containerRoot, err := securejoin.SecureJoin(l.Root, id) - if err != nil { - return nil, err - } + containerRoot := filepath.Join(l.Root, id) state, err := l.loadState(containerRoot, id) if err != nil { return nil, err @@ -268,7 +240,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) { if err := c.refreshState(); err != nil { return nil, err } - if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { + if intelrdt.IsEnabled() { c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) } return c, nil @@ -350,11 +322,7 @@ func (l *LinuxFactory) StartInitialization() (err error) { } func (l *LinuxFactory) loadState(root, id string) (*State, error) { - stateFilePath, err := securejoin.SecureJoin(root, stateFilename) - if err != nil { - return nil, err - } - f, err := os.Open(stateFilePath) + f, err := os.Open(filepath.Join(root, stateFilename)) if err != nil { if os.IsNotExist(err) { return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists) @@ -370,7 +338,7 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) { } func (l *LinuxFactory) validateID(id string) error { - if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) { + if !idRegex.MatchString(id) { return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go index cd7ff67a70..2770be3071 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -6,7 +6,6 @@ import ( "encoding/json" "fmt" "io" - "io/ioutil" "net" "os" "strings" @@ -21,7 +20,6 @@ import ( "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/user" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/pkg/errors" "github.com/sirupsen/logrus" "github.com/vishvananda/netlink" ) @@ -66,8 +64,7 @@ type initConfig struct { CreateConsole bool `json:"create_console"` ConsoleWidth uint16 `json:"console_width"` ConsoleHeight uint16 `json:"console_height"` - RootlessEUID bool `json:"rootless_euid,omitempty"` - RootlessCgroups bool `json:"rootless_cgroups,omitempty"` + Rootless bool `json:"rootless"` } type initer interface { @@ -124,7 +121,7 @@ func finalizeNamespace(config *initConfig) error { // inherited are marked close-on-exec so they stay out of the // container if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { - return errors.Wrap(err, "close exec fds") + return err } capabilities := &configs.Capabilities{} @@ -139,20 +136,20 @@ func finalizeNamespace(config *initConfig) error { } // drop capabilities in bounding set before changing user if err := w.ApplyBoundingSet(); err != nil { - return errors.Wrap(err, "apply bounding set") + return err } // preserve existing capabilities while we change users if err := system.SetKeepCaps(); err != nil { - return errors.Wrap(err, "set keep caps") + return err } if err := setupUser(config); err != nil { - return errors.Wrap(err, "setup user") + return err } if err := system.ClearKeepCaps(); err != nil { - return errors.Wrap(err, "clear keep caps") + return err } if err := w.ApplyCaps(); err != nil { - return errors.Wrap(err, "apply caps") + return err } if config.Cwd != "" { if err := unix.Chdir(config.Cwd); err != nil { @@ -220,7 +217,11 @@ func syncParentReady(pipe io.ReadWriter) error { } // Wait for parent to give the all-clear. - return readSync(pipe, procRun) + if err := readSync(pipe, procRun); err != nil { + return err + } + + return nil } // syncParentHooks sends to the given pipe a JSON payload which indicates that @@ -233,7 +234,11 @@ func syncParentHooks(pipe io.ReadWriter) error { } // Wait for parent to give the all-clear. - return readSync(pipe, procResume) + if err := readSync(pipe, procResume); err != nil { + return err + } + + return nil } // setupUser changes the groups, gid, and uid for the user inside the container @@ -277,7 +282,7 @@ func setupUser(config *initConfig) error { return fmt.Errorf("cannot set gid to unmapped user in user namespace") } - if config.RootlessEUID { + if config.Rootless { // We cannot set any additional groups in a rootless container and thus // we bail if the user asked us to do so. TODO: We currently can't do // this check earlier, but if libcontainer.Process.User was typesafe @@ -293,18 +298,11 @@ func setupUser(config *initConfig) error { return err } - setgroups, err := ioutil.ReadFile("/proc/self/setgroups") - if err != nil && !os.IsNotExist(err) { - return err - } - // This isn't allowed in an unprivileged user namespace since Linux 3.19. // There's nothing we can do about /etc/group entries, so we silently // ignore setting groups here (since the user didn't explicitly ask us to // set the group). - allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny" - - if allowSupGroups { + if !config.Rootless { suppGroups := append(execUser.Sgids, addGroups...) if err := unix.Setgroups(suppGroups); err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go index 0071ce7557..487c630af6 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go @@ -16,26 +16,20 @@ import ( ) /* - * About Intel RDT features: + * About Intel RDT/CAT feature: * Intel platforms with new Xeon CPU support Resource Director Technology (RDT). - * Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are - * two sub-features of RDT. + * Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3 + * Cache is the only resource that is supported in RDT. * - * Cache Allocation Technology (CAT) provides a way for the software to restrict - * cache allocation to a defined 'subset' of L3 cache which may be overlapping - * with other 'subsets'. The different subsets are identified by class of - * service (CLOS) and each CLOS has a capacity bitmask (CBM). + * This feature provides a way for the software to restrict cache allocation to a + * defined 'subset' of L3 cache which may be overlapping with other 'subsets'. + * The different subsets are identified by class of service (CLOS) and each CLOS + * has a capacity bitmask (CBM). * - * Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle - * over memory bandwidth for the software. A user controls the resource by - * indicating the percentage of maximum memory bandwidth or memory bandwidth - * limit in MBps unit if MBA Software Controller is enabled. + * For more information about Intel RDT/CAT can be found in the section 17.17 + * of Intel Software Developer Manual. * - * More details about Intel RDT CAT and MBA can be found in the section 17.18 - * of Intel Software Developer Manual: - * https://software.intel.com/en-us/articles/intel-sdm - * - * About Intel RDT kernel interface: + * About Intel RDT/CAT kernel interface: * In Linux 4.10 kernel or newer, the interface is defined and exposed via * "resource control" filesystem, which is a "cgroup-like" interface. * @@ -43,98 +37,59 @@ import ( * interfaces in a container. But unlike cgroups' hierarchy, it has single level * filesystem layout. * - * CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via - * "resource control" filesystem. - * * Intel RDT "resource control" filesystem hierarchy: * mount -t resctrl resctrl /sys/fs/resctrl * tree /sys/fs/resctrl * /sys/fs/resctrl/ * |-- info * | |-- L3 - * | | |-- cbm_mask - * | | |-- min_cbm_bits - * | | |-- num_closids - * | |-- MB - * | |-- bandwidth_gran - * | |-- delay_linear - * | |-- min_bandwidth + * | |-- cbm_mask + * | |-- min_cbm_bits * | |-- num_closids - * |-- ... + * |-- cpus * |-- schemata * |-- tasks * |-- - * |-- ... + * |-- cpus * |-- schemata * |-- tasks * - * For runc, we can make use of `tasks` and `schemata` configuration for L3 - * cache and memory bandwidth resources constraints. + * For runc, we can make use of `tasks` and `schemata` configuration for L3 cache + * resource constraints. * - * The file `tasks` has a list of tasks that belongs to this group (e.g., + * The file `tasks` has a list of tasks that belongs to this group (e.g., * " group). Tasks can be added to a group by writing the task ID - * to the "tasks" file (which will automatically remove them from the previous + * to the "tasks" file (which will automatically remove them from the previous * group to which they belonged). New tasks created by fork(2) and clone(2) are - * added to the same group as their parent. + * added to the same group as their parent. If a pid is not in any sub group, it is + * in root group. * - * The file `schemata` has a list of all the resources available to this group. - * Each resource (L3 cache, memory bandwidth) has its own line and format. - * - * L3 cache schema: - * It has allocation bitmasks/values for L3 cache on each socket, which - * contains L3 cache id and capacity bitmask (CBM). + * The file `schemata` has allocation bitmasks/values for L3 cache on each socket, + * which contains L3 cache id and capacity bitmask (CBM). * Format: "L3:=;=;..." - * For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0" + * For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. * * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can * be set is less than the max bit. The max bits in the CBM is varied among - * supported Intel CPU models. Kernel will check if it is valid when writing. - * e.g., default value 0xfffff in root indicates the max bits of CBM is 20 - * bits, which mapping to entire L3 cache capacity. Some valid CBM values to - * set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. - * - * Memory bandwidth schema: - * It has allocation values for memory bandwidth on each socket, which contains - * L3 cache id and memory bandwidth. - * Format: "MB:=bandwidth0;=bandwidth1;..." - * For example, on a two-socket machine, the schema line could be "MB:0=20;1=70" - * - * The minimum bandwidth percentage value for each CPU model is predefined and - * can be looked up through "info/MB/min_bandwidth". The bandwidth granularity - * that is allocated is also dependent on the CPU model and can be looked up at - * "info/MB/bandwidth_gran". The available bandwidth control steps are: - * min_bw + N * bw_gran. Intermediate values are rounded to the next control - * step available on the hardware. - * - * If MBA Software Controller is enabled through mount option "-o mba_MBps": - * mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl - * We could specify memory bandwidth in "MBps" (Mega Bytes per second) unit - * instead of "percentages". The kernel underneath would use a software feedback - * mechanism or a "Software Controller" which reads the actual bandwidth using - * MBM counters and adjust the memory bandwidth percentages to ensure: - * "actual memory bandwidth < user specified memory bandwidth". - * - * For example, on a two-socket machine, the schema line could be - * "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0 - * and 7000 MBps memory bandwidth limit on socket 1. + * supported Intel Xeon platforms. In Intel RDT "resource control" filesystem + * layout, the CBM in a group should be a subset of the CBM in root. Kernel will + * check if it is valid when writing. e.g., 0xfffff in root indicates the max bits + * of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM + * values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. * - * For more information about Intel RDT kernel interface: + * For more information about Intel RDT/CAT kernel interface: * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt * * An example for runc: * Consider a two-socket machine with two L3 caches where the default CBM is - * 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10% - * with a memory bandwidth granularity of 10%. - * - * Tasks inside the container only have access to the "upper" 7/11 of L3 cache - * on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a - * maximum memory bandwidth of 20% on socket 0 and 70% on socket 1. + * 0xfffff and the max CBM length is 20 bits. With this configuration, tasks + * inside the container only have access to the "upper" 80% of L3 cache id 0 and + * the "lower" 50% L3 cache id 1: * * "linux": { - * "intelRdt": { - * "l3CacheSchema": "L3:0=7f0;1=1f", - * "memBwSchema": "MB:0=20;1=70" + * "intelRdt": { + * "l3CacheSchema": "L3:0=ffff0;1=3ff" * } * } */ @@ -174,12 +129,8 @@ var ( intelRdtRoot string intelRdtRootLock sync.Mutex - // The flag to indicate if Intel RDT/CAT is enabled - isCatEnabled bool - // The flag to indicate if Intel RDT/MBA is enabled - isMbaEnabled bool - // The flag to indicate if Intel RDT/MBA Software Controller is enabled - isMbaScEnabled bool + // The flag to indicate if Intel RDT is supported + isEnabled bool ) type intelRdtData struct { @@ -188,40 +139,19 @@ type intelRdtData struct { pid int } -// Check if Intel RDT sub-features are enabled in init() +// Check if Intel RDT is enabled in init() func init() { - // 1. Check if hardware and kernel support Intel RDT sub-features - // "cat_l3" flag for CAT and "mba" flag for MBA - isCatFlagSet, isMbaFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") - if err != nil { + // 1. Check if hardware and kernel support Intel RDT/CAT feature + // "cat_l3" flag is set if supported + isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") + if !isFlagSet || err != nil { + isEnabled = false return } // 2. Check if Intel RDT "resource control" filesystem is mounted // The user guarantees to mount the filesystem - if !isIntelRdtMounted() { - return - } - - // 3. Double check if Intel RDT sub-features are available in - // "resource control" filesystem. Intel RDT sub-features can be - // selectively disabled or enabled by kernel command line - // (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel - if isCatFlagSet { - if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil { - isCatEnabled = true - } - } - if isMbaScEnabled { - // We confirm MBA Software Controller is enabled in step 2, - // MBA should be enabled because MBA Software Controller - // depends on MBA - isMbaEnabled = true - } else if isMbaFlagSet { - if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil { - isMbaEnabled = true - } - } + isEnabled = isIntelRdtMounted() } // Return the mount point path of Intel RDT "resource control" filesysem @@ -247,16 +177,11 @@ func findIntelRdtMountpointDir() (string, error) { } if postSeparatorFields[0] == "resctrl" { - // Check that the mount is properly formatted. + // Check that the mount is properly formated. if numPostFields < 3 { return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) } - // Check if MBA Software Controller is enabled through mount option "-o mba_MBps" - if strings.Contains(postSeparatorFields[2], "mba_MBps") { - isMbaScEnabled = true - } - return fields[4], nil } } @@ -298,40 +223,30 @@ func isIntelRdtMounted() bool { return true } -func parseCpuInfoFile(path string) (bool, bool, error) { - isCatFlagSet := false - isMbaFlagSet := false - +func parseCpuInfoFile(path string) (bool, error) { f, err := os.Open(path) if err != nil { - return false, false, err + return false, err } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { if err := s.Err(); err != nil { - return false, false, err + return false, err } - line := s.Text() - - // Search "cat_l3" and "mba" flags in first "flags" line - if strings.Contains(line, "flags") { - flags := strings.Split(line, " ") - // "cat_l3" flag for CAT and "mba" flag for MBA - for _, flag := range flags { - switch flag { - case "cat_l3": - isCatFlagSet = true - case "mba": - isMbaFlagSet = true - } + text := s.Text() + flags := strings.Split(text, " ") + + // "cat_l3" flag is set if Intel RDT/CAT is supported + for _, flag := range flags { + if flag == "cat_l3" { + return true, nil } - return isCatFlagSet, isMbaFlagSet, nil } } - return isCatFlagSet, isMbaFlagSet, nil + return false, nil } func parseUint(s string, base, bitSize int) (uint64, error) { @@ -377,6 +292,30 @@ func getIntelRdtParamString(path, file string) (string, error) { return strings.TrimSpace(string(contents)), nil } +func readTasksFile(dir string) ([]int, error) { + f, err := os.Open(filepath.Join(dir, IntelRdtTasks)) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + s = bufio.NewScanner(f) + out = []int{} + ) + + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, nil +} + func writeFile(dir, file, data string) error { if dir == "" { return fmt.Errorf("no such directory for %s", file) @@ -429,64 +368,13 @@ func getL3CacheInfo() (*L3CacheInfo, error) { return l3CacheInfo, nil } -// Get the read-only memory bandwidth information -func getMemBwInfo() (*MemBwInfo, error) { - memBwInfo := &MemBwInfo{} - - rootPath, err := getIntelRdtRoot() - if err != nil { - return memBwInfo, err - } - - path := filepath.Join(rootPath, "info", "MB") - bandwidthGran, err := getIntelRdtParamUint(path, "bandwidth_gran") - if err != nil { - return memBwInfo, err - } - delayLinear, err := getIntelRdtParamUint(path, "delay_linear") - if err != nil { - return memBwInfo, err - } - minBandwidth, err := getIntelRdtParamUint(path, "min_bandwidth") - if err != nil { - return memBwInfo, err - } - numClosids, err := getIntelRdtParamUint(path, "num_closids") - if err != nil { - return memBwInfo, err - } - - memBwInfo.BandwidthGran = bandwidthGran - memBwInfo.DelayLinear = delayLinear - memBwInfo.MinBandwidth = minBandwidth - memBwInfo.NumClosids = numClosids - - return memBwInfo, nil -} - -// Get diagnostics for last filesystem operation error from file info/last_cmd_status -func getLastCmdStatus() (string, error) { - rootPath, err := getIntelRdtRoot() - if err != nil { - return "", err - } - - path := filepath.Join(rootPath, "info") - lastCmdStatus, err := getIntelRdtParamString(path, "last_cmd_status") - if err != nil { - return "", err - } - - return lastCmdStatus, nil -} - // WriteIntelRdtTasks writes the specified pid into the "tasks" file func WriteIntelRdtTasks(dir string, pid int) error { if dir == "" { return fmt.Errorf("no such directory for %s", IntelRdtTasks) } - // Don't attach any pid if -1 is specified as a pid + // Dont attach any pid if -1 is specified as a pid if pid != -1 { if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil { return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err) @@ -495,19 +383,9 @@ func WriteIntelRdtTasks(dir string, pid int) error { return nil } -// Check if Intel RDT/CAT is enabled -func IsCatEnabled() bool { - return isCatEnabled -} - -// Check if Intel RDT/MBA is enabled -func IsMbaEnabled() bool { - return isMbaEnabled -} - -// Check if Intel RDT/MBA Software Controller is enabled -func IsMbaScEnabled() bool { - return isMbaScEnabled +// Check if Intel RDT is enabled +func IsEnabled() bool { + return isEnabled } // Get the 'container_id' path in Intel RDT "resource control" filesystem @@ -547,7 +425,7 @@ func (m *IntelRdtManager) Apply(pid int) (err error) { func (m *IntelRdtManager) Destroy() error { m.mu.Lock() defer m.mu.Unlock() - if err := os.RemoveAll(m.GetPath()); err != nil { + if err := os.RemoveAll(m.Path); err != nil { return err } m.Path = "" @@ -574,143 +452,65 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) { defer m.mu.Unlock() stats := NewStats() + // The read-only L3 cache information + l3CacheInfo, err := getL3CacheInfo() + if err != nil { + return nil, err + } + stats.L3CacheInfo = l3CacheInfo + + // The read-only L3 cache schema in root rootPath, err := getIntelRdtRoot() if err != nil { return nil, err } - // The read-only L3 cache and memory bandwidth schemata in root tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata") if err != nil { return nil, err } + // L3 cache schema is in the first line schemaRootStrings := strings.Split(tmpRootStrings, "\n") + stats.L3CacheSchemaRoot = schemaRootStrings[0] - // The L3 cache and memory bandwidth schemata in 'container_id' group + // The L3 cache schema in 'container_id' group tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata") if err != nil { return nil, err } + // L3 cache schema is in the first line schemaStrings := strings.Split(tmpStrings, "\n") - - if IsCatEnabled() { - // The read-only L3 cache information - l3CacheInfo, err := getL3CacheInfo() - if err != nil { - return nil, err - } - stats.L3CacheInfo = l3CacheInfo - - // The read-only L3 cache schema in root - for _, schemaRoot := range schemaRootStrings { - if strings.Contains(schemaRoot, "L3") { - stats.L3CacheSchemaRoot = strings.TrimSpace(schemaRoot) - } - } - - // The L3 cache schema in 'container_id' group - for _, schema := range schemaStrings { - if strings.Contains(schema, "L3") { - stats.L3CacheSchema = strings.TrimSpace(schema) - } - } - } - - if IsMbaEnabled() { - // The read-only memory bandwidth information - memBwInfo, err := getMemBwInfo() - if err != nil { - return nil, err - } - stats.MemBwInfo = memBwInfo - - // The read-only memory bandwidth information - for _, schemaRoot := range schemaRootStrings { - if strings.Contains(schemaRoot, "MB") { - stats.MemBwSchemaRoot = strings.TrimSpace(schemaRoot) - } - } - - // The memory bandwidth schema in 'container_id' group - for _, schema := range schemaStrings { - if strings.Contains(schema, "MB") { - stats.MemBwSchema = strings.TrimSpace(schema) - } - } - } + stats.L3CacheSchema = schemaStrings[0] return stats, nil } // Set Intel RDT "resource control" filesystem as configured. func (m *IntelRdtManager) Set(container *configs.Config) error { - // About L3 cache schema: - // It has allocation bitmasks/values for L3 cache on each socket, + path := m.GetPath() + + // About L3 cache schema file: + // The schema has allocation masks/values for L3 cache on each socket, // which contains L3 cache id and capacity bitmask (CBM). - // Format: "L3:=;=;..." - // For example, on a two-socket machine, the schema line could be: - // L3:0=ff;1=c0 - // which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM - // is 0xc0. + // Format: "L3:=;=;..." + // For example, on a two-socket machine, L3's schema line could be: + // L3:0=ff;1=c0 + // Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. // + // About L3 cache CBM validity: // The valid L3 cache CBM is a *contiguous bits set* and number of // bits that can be set is less than the max bit. The max bits in the - // CBM is varied among supported Intel CPU models. Kernel will check - // if it is valid when writing. e.g., default value 0xfffff in root - // indicates the max bits of CBM is 20 bits, which mapping to entire - // L3 cache capacity. Some valid CBM values to set in a group: - // 0xf, 0xf0, 0x3ff, 0x1f00 and etc. - // - // - // About memory bandwidth schema: - // It has allocation values for memory bandwidth on each socket, which - // contains L3 cache id and memory bandwidth. - // Format: "MB:=bandwidth0;=bandwidth1;..." - // For example, on a two-socket machine, the schema line could be: - // "MB:0=20;1=70" - // - // The minimum bandwidth percentage value for each CPU model is - // predefined and can be looked up through "info/MB/min_bandwidth". - // The bandwidth granularity that is allocated is also dependent on - // the CPU model and can be looked up at "info/MB/bandwidth_gran". - // The available bandwidth control steps are: min_bw + N * bw_gran. - // Intermediate values are rounded to the next control step available - // on the hardware. - // - // If MBA Software Controller is enabled through mount option - // "-o mba_MBps": mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl - // We could specify memory bandwidth in "MBps" (Mega Bytes per second) - // unit instead of "percentages". The kernel underneath would use a - // software feedback mechanism or a "Software Controller" which reads - // the actual bandwidth using MBM counters and adjust the memory - // bandwidth percentages to ensure: - // "actual memory bandwidth < user specified memory bandwidth". - // - // For example, on a two-socket machine, the schema line could be - // "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on - // socket 0 and 7000 MBps memory bandwidth limit on socket 1. + // CBM is varied among supported Intel Xeon platforms. In Intel RDT + // "resource control" filesystem layout, the CBM in a group should + // be a subset of the CBM in root. Kernel will check if it is valid + // when writing. + // e.g., 0xfffff in root indicates the max bits of CBM is 20 bits, + // which mapping to entire L3 cache capacity. Some valid CBM values + // to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. if container.IntelRdt != nil { - path := m.GetPath() l3CacheSchema := container.IntelRdt.L3CacheSchema - memBwSchema := container.IntelRdt.MemBwSchema - - // Write a single joint schema string to schemata file - if l3CacheSchema != "" && memBwSchema != "" { - if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil { - return NewLastCmdError(err) - } - } - - // Write only L3 cache schema string to schemata file - if l3CacheSchema != "" && memBwSchema == "" { + if l3CacheSchema != "" { if err := writeFile(path, "schemata", l3CacheSchema); err != nil { - return NewLastCmdError(err) - } - } - - // Write only memory bandwidth schema string to schemata file - if l3CacheSchema == "" && memBwSchema != "" { - if err := writeFile(path, "schemata", memBwSchema); err != nil { - return NewLastCmdError(err) + return err } } } @@ -721,11 +521,11 @@ func (m *IntelRdtManager) Set(container *configs.Config) error { func (raw *intelRdtData) join(id string) (string, error) { path := filepath.Join(raw.root, id) if err := os.MkdirAll(path, 0755); err != nil { - return "", NewLastCmdError(err) + return "", err } if err := WriteIntelRdtTasks(path, raw.pid); err != nil { - return "", NewLastCmdError(err) + return "", err } return path, nil } @@ -751,23 +551,3 @@ func IsNotFound(err error) bool { _, ok := err.(*NotFoundError) return ok } - -type LastCmdError struct { - LastCmdStatus string - Err error -} - -func (e *LastCmdError) Error() string { - return fmt.Sprintf(e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus) -} - -func NewLastCmdError(err error) error { - lastCmdStatus, err1 := getLastCmdStatus() - if err1 == nil { - return &LastCmdError{ - LastCmdStatus: lastCmdStatus, - Err: err, - } - } - return err -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go index df5686f3b8..095c0a380c 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/stats.go @@ -8,13 +8,6 @@ type L3CacheInfo struct { NumClosids uint64 `json:"num_closids,omitempty"` } -type MemBwInfo struct { - BandwidthGran uint64 `json:"bandwidth_gran,omitempty"` - DelayLinear uint64 `json:"delay_linear,omitempty"` - MinBandwidth uint64 `json:"min_bandwidth,omitempty"` - NumClosids uint64 `json:"num_closids,omitempty"` -} - type Stats struct { // The read-only L3 cache information L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` @@ -24,15 +17,6 @@ type Stats struct { // The L3 cache schema in 'container_id' group L3CacheSchema string `json:"l3_cache_schema,omitempty"` - - // The read-only memory bandwidth information - MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"` - - // The read-only memory bandwidth schema in root - MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"` - - // The memory bandwidth schema in 'container_id' group - MemBwSchema string `json:"mem_bw_schema,omitempty"` } func NewStats() *Stats { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go index 74dedd56ca..ce8b4e6b04 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/keys/keyctl.go @@ -7,8 +7,6 @@ import ( "strconv" "strings" - "github.com/pkg/errors" - "golang.org/x/sys/unix" ) @@ -17,7 +15,7 @@ type KeySerial uint32 func JoinSessionKeyring(name string) (KeySerial, error) { sessKeyId, err := unix.KeyctlJoinSessionKeyring(name) if err != nil { - return 0, errors.Wrap(err, "create session key") + return 0, fmt.Errorf("could not create session key: %v", err) } return KeySerial(sessKeyId), nil } @@ -44,5 +42,9 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { perm := (uint32(perm64) & mask) | setbits - return unix.KeyctlSetperm(int(ringId), perm) + if err := unix.KeyctlSetperm(int(ringId), perm); err != nil { + return err + } + + return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go index 1d4f5033aa..ab453cde91 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/message_linux.go @@ -10,16 +10,16 @@ import ( // list of known message types we want to send to bootstrap program // The number is randomly chosen to not conflict with known netlink types const ( - InitMsg uint16 = 62000 - CloneFlagsAttr uint16 = 27281 - NsPathsAttr uint16 = 27282 - UidmapAttr uint16 = 27283 - GidmapAttr uint16 = 27284 - SetgroupAttr uint16 = 27285 - OomScoreAdjAttr uint16 = 27286 - RootlessEUIDAttr uint16 = 27287 - UidmapPathAttr uint16 = 27288 - GidmapPathAttr uint16 = 27289 + InitMsg uint16 = 62000 + CloneFlagsAttr uint16 = 27281 + NsPathsAttr uint16 = 27282 + UidmapAttr uint16 = 27283 + GidmapAttr uint16 = 27284 + SetgroupAttr uint16 = 27285 + OomScoreAdjAttr uint16 = 27286 + RootlessAttr uint16 = 27287 + UidmapPathAttr uint16 = 27288 + GidmapPathAttr uint16 = 27289 ) type Int32msg struct { @@ -77,13 +77,13 @@ func (msg *Boolmsg) Serialize() []byte { native.PutUint16(buf[0:2], uint16(msg.Len())) native.PutUint16(buf[2:4], msg.Type) if msg.Value { - native.PutUint32(buf[4:8], uint32(1)) + buf[4] = 1 } else { - native.PutUint32(buf[4:8], uint32(0)) + buf[4] = 0 } return buf } func (msg *Boolmsg) Len() int { - return unix.NLA_HDRLEN + 4 // alignment + return unix.NLA_HDRLEN + 1 } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go index 569c53f6e8..5075bee4db 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/network_linux.go @@ -5,15 +5,18 @@ package libcontainer import ( "fmt" "io/ioutil" + "net" "path/filepath" "strconv" "strings" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" "github.com/vishvananda/netlink" ) var strategies = map[string]networkStrategy{ + "veth": &veth{}, "loopback": &loopback{}, } @@ -100,3 +103,157 @@ func (l *loopback) attach(n *configs.Network) (err error) { func (l *loopback) detach(n *configs.Network) (err error) { return nil } + +// veth is a network strategy that uses a bridge and creates +// a veth pair, one that is attached to the bridge on the host and the other +// is placed inside the container's namespace +type veth struct { +} + +func (v *veth) detach(n *configs.Network) (err error) { + return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil) +} + +// attach a container network interface to an external network +func (v *veth) attach(n *configs.Network) (err error) { + brl, err := netlink.LinkByName(n.Bridge) + if err != nil { + return err + } + br, ok := brl.(*netlink.Bridge) + if !ok { + return fmt.Errorf("Wrong device type %T", brl) + } + host, err := netlink.LinkByName(n.HostInterfaceName) + if err != nil { + return err + } + + if err := netlink.LinkSetMaster(host, br); err != nil { + return err + } + if err := netlink.LinkSetMTU(host, n.Mtu); err != nil { + return err + } + if n.HairpinMode { + if err := netlink.LinkSetHairpin(host, true); err != nil { + return err + } + } + if err := netlink.LinkSetUp(host); err != nil { + return err + } + + return nil +} + +func (v *veth) create(n *network, nspid int) (err error) { + tmpName, err := v.generateTempPeerName() + if err != nil { + return err + } + n.TempVethPeerName = tmpName + if n.Bridge == "" { + return fmt.Errorf("bridge is not specified") + } + veth := &netlink.Veth{ + LinkAttrs: netlink.LinkAttrs{ + Name: n.HostInterfaceName, + TxQLen: n.TxQueueLen, + }, + PeerName: n.TempVethPeerName, + } + if err := netlink.LinkAdd(veth); err != nil { + return err + } + defer func() { + if err != nil { + netlink.LinkDel(veth) + } + }() + if err := v.attach(&n.Network); err != nil { + return err + } + child, err := netlink.LinkByName(n.TempVethPeerName) + if err != nil { + return err + } + return netlink.LinkSetNsPid(child, nspid) +} + +func (v *veth) generateTempPeerName() (string, error) { + return utils.GenerateRandomName("veth", 7) +} + +func (v *veth) initialize(config *network) error { + peer := config.TempVethPeerName + if peer == "" { + return fmt.Errorf("peer is not specified") + } + child, err := netlink.LinkByName(peer) + if err != nil { + return err + } + if err := netlink.LinkSetDown(child); err != nil { + return err + } + if err := netlink.LinkSetName(child, config.Name); err != nil { + return err + } + // get the interface again after we changed the name as the index also changes. + if child, err = netlink.LinkByName(config.Name); err != nil { + return err + } + if config.MacAddress != "" { + mac, err := net.ParseMAC(config.MacAddress) + if err != nil { + return err + } + if err := netlink.LinkSetHardwareAddr(child, mac); err != nil { + return err + } + } + ip, err := netlink.ParseAddr(config.Address) + if err != nil { + return err + } + if err := netlink.AddrAdd(child, ip); err != nil { + return err + } + if config.IPv6Address != "" { + ip6, err := netlink.ParseAddr(config.IPv6Address) + if err != nil { + return err + } + if err := netlink.AddrAdd(child, ip6); err != nil { + return err + } + } + if err := netlink.LinkSetMTU(child, config.Mtu); err != nil { + return err + } + if err := netlink.LinkSetUp(child); err != nil { + return err + } + if config.Gateway != "" { + gw := net.ParseIP(config.Gateway) + if err := netlink.RouteAdd(&netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + LinkIndex: child.Attrs().Index, + Gw: gw, + }); err != nil { + return err + } + } + if config.IPv6Gateway != "" { + gw := net.ParseIP(config.IPv6Gateway) + if err := netlink.RouteAdd(&netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + LinkIndex: child.Attrs().Index, + Gw: gw, + }); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c deleted file mode 100644 index ad10f14067..0000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c +++ /dev/null @@ -1,516 +0,0 @@ -/* - * Copyright (C) 2019 Aleksa Sarai - * Copyright (C) 2019 SUSE LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -/* Use our own wrapper for memfd_create. */ -#if !defined(SYS_memfd_create) && defined(__NR_memfd_create) -# define SYS_memfd_create __NR_memfd_create -#endif -/* memfd_create(2) flags -- copied from . */ -#ifndef MFD_CLOEXEC -# define MFD_CLOEXEC 0x0001U -# define MFD_ALLOW_SEALING 0x0002U -#endif -int memfd_create(const char *name, unsigned int flags) -{ -#ifdef SYS_memfd_create - return syscall(SYS_memfd_create, name, flags); -#else - errno = ENOSYS; - return -1; -#endif -} - - -/* This comes directly from . */ -#ifndef F_LINUX_SPECIFIC_BASE -# define F_LINUX_SPECIFIC_BASE 1024 -#endif -#ifndef F_ADD_SEALS -# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) -# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) -#endif -#ifndef F_SEAL_SEAL -# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ -# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ -# define F_SEAL_GROW 0x0004 /* prevent file from growing */ -# define F_SEAL_WRITE 0x0008 /* prevent writes */ -#endif - -#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY" -#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" -#define RUNC_MEMFD_SEALS \ - (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) - -static void *must_realloc(void *ptr, size_t size) -{ - void *old = ptr; - do { - ptr = realloc(old, size); - } while(!ptr); - return ptr; -} - -/* - * Verify whether we are currently in a self-cloned program (namely, is - * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather - * for shmem files), and we want to be sure it's actually sealed. - */ -static int is_self_cloned(void) -{ - int fd, ret, is_cloned = 0; - struct stat statbuf = {}; - struct statfs fsbuf = {}; - - fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); - if (fd < 0) - return -ENOTRECOVERABLE; - - /* - * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for - * this, because you cannot write to a sealed memfd no matter what (so - * sharing it isn't a bad thing -- and an admin could bind-mount a sealed - * memfd to /usr/bin/runc to allow re-use). - */ - ret = fcntl(fd, F_GET_SEALS); - if (ret >= 0) { - is_cloned = (ret == RUNC_MEMFD_SEALS); - goto out; - } - - /* - * All other forms require CLONED_BINARY_ENV, since they are potentially - * writeable (or we can't tell if they're fully safe) and thus we must - * check the environment as an extra layer of defence. - */ - if (!getenv(CLONED_BINARY_ENV)) { - is_cloned = false; - goto out; - } - - /* - * Is the binary on a read-only filesystem? We can't detect bind-mounts in - * particular (in-kernel they are identical to regular mounts) but we can - * at least be sure that it's read-only. In addition, to make sure that - * it's *our* bind-mount we check CLONED_BINARY_ENV. - */ - if (fstatfs(fd, &fsbuf) >= 0) - is_cloned |= (fsbuf.f_flags & MS_RDONLY); - - /* - * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 - * which appears to have a borked backport of F_GET_SEALS. Either way, - * having a file which has no hardlinks indicates that we aren't using - * a host-side "runc" binary and this is something that a container - * cannot fake (because unlinking requires being able to resolve the - * path that you want to unlink). - */ - if (fstat(fd, &statbuf) >= 0) - is_cloned |= (statbuf.st_nlink == 0); - -out: - close(fd); - return is_cloned; -} - -/* Read a given file into a new buffer, and providing the length. */ -static char *read_file(char *path, size_t *length) -{ - int fd; - char buf[4096], *copy = NULL; - - if (!length) - return NULL; - - fd = open(path, O_RDONLY | O_CLOEXEC); - if (fd < 0) - return NULL; - - *length = 0; - for (;;) { - ssize_t n; - - n = read(fd, buf, sizeof(buf)); - if (n < 0) - goto error; - if (!n) - break; - - copy = must_realloc(copy, (*length + n) * sizeof(*copy)); - memcpy(copy + *length, buf, n); - *length += n; - } - close(fd); - return copy; - -error: - close(fd); - free(copy); - return NULL; -} - -/* - * A poor-man's version of "xargs -0". Basically parses a given block of - * NUL-delimited data, within the given length and adds a pointer to each entry - * to the array of pointers. - */ -static int parse_xargs(char *data, int data_length, char ***output) -{ - int num = 0; - char *cur = data; - - if (!data || *output != NULL) - return -1; - - while (cur < data + data_length) { - num++; - *output = must_realloc(*output, (num + 1) * sizeof(**output)); - (*output)[num - 1] = cur; - cur += strlen(cur) + 1; - } - (*output)[num] = NULL; - return num; -} - -/* - * "Parse" out argv from /proc/self/cmdline. - * This is necessary because we are running in a context where we don't have a - * main() that we can just get the arguments from. - */ -static int fetchve(char ***argv) -{ - char *cmdline = NULL; - size_t cmdline_size; - - cmdline = read_file("/proc/self/cmdline", &cmdline_size); - if (!cmdline) - goto error; - - if (parse_xargs(cmdline, cmdline_size, argv) <= 0) - goto error; - - return 0; - -error: - free(cmdline); - return -EINVAL; -} - -enum { - EFD_NONE = 0, - EFD_MEMFD, - EFD_FILE, -}; - -/* - * This comes from . We can't hard-code __O_TMPFILE because it - * changes depending on the architecture. If we don't have O_TMPFILE we always - * have the mkostemp(3) fallback. - */ -#ifndef O_TMPFILE -# if defined(__O_TMPFILE) && defined(O_DIRECTORY) -# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) -# endif -#endif - -static int make_execfd(int *fdtype) -{ - int fd = -1; - char template[PATH_MAX] = {0}; - char *prefix = getenv("_LIBCONTAINER_STATEDIR"); - - if (!prefix || *prefix != '/') - prefix = "/tmp"; - if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) - return -1; - - /* - * Now try memfd, it's much nicer than actually creating a file in STATEDIR - * since it's easily detected thanks to sealing and also doesn't require - * assumptions about STATEDIR. - */ - *fdtype = EFD_MEMFD; - fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); - if (fd >= 0) - return fd; - if (errno != ENOSYS && errno != EINVAL) - goto error; - -#ifdef O_TMPFILE - /* - * Try O_TMPFILE to avoid races where someone might snatch our file. Note - * that O_EXCL isn't actually a security measure here (since you can just - * fd re-open it and clear O_EXCL). - */ - *fdtype = EFD_FILE; - fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); - if (fd >= 0) { - struct stat statbuf = {}; - bool working_otmpfile = false; - - /* - * open(2) ignores unknown O_* flags -- yeah, I was surprised when I - * found this out too. As a result we can't check for EINVAL. However, - * if we get nlink != 0 (or EISDIR) then we know that this kernel - * doesn't support O_TMPFILE. - */ - if (fstat(fd, &statbuf) >= 0) - working_otmpfile = (statbuf.st_nlink == 0); - - if (working_otmpfile) - return fd; - - /* Pretend that we got EISDIR since O_TMPFILE failed. */ - close(fd); - errno = EISDIR; - } - if (errno != EISDIR) - goto error; -#endif /* defined(O_TMPFILE) */ - - /* - * Our final option is to create a temporary file the old-school way, and - * then unlink it so that nothing else sees it by accident. - */ - *fdtype = EFD_FILE; - fd = mkostemp(template, O_CLOEXEC); - if (fd >= 0) { - if (unlink(template) >= 0) - return fd; - close(fd); - } - -error: - *fdtype = EFD_NONE; - return -1; -} - -static int seal_execfd(int *fd, int fdtype) -{ - switch (fdtype) { - case EFD_MEMFD: - return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS); - case EFD_FILE: { - /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ - int newfd; - char fdpath[PATH_MAX] = {0}; - - if (fchmod(*fd, 0100) < 0) - return -1; - - if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) - return -1; - - newfd = open(fdpath, O_PATH | O_CLOEXEC); - if (newfd < 0) - return -1; - - close(*fd); - *fd = newfd; - return 0; - } - default: - break; - } - return -1; -} - -static int try_bindfd(void) -{ - int fd, ret = -1; - char template[PATH_MAX] = {0}; - char *prefix = getenv("_LIBCONTAINER_STATEDIR"); - - if (!prefix || *prefix != '/') - prefix = "/tmp"; - if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) - return ret; - - /* - * We need somewhere to mount it, mounting anything over /proc/self is a - * BAD idea on the host -- even if we do it temporarily. - */ - fd = mkstemp(template); - if (fd < 0) - return ret; - close(fd); - - /* - * For obvious reasons this won't work in rootless mode because we haven't - * created a userns+mntns -- but getting that to work will be a bit - * complicated and it's only worth doing if someone actually needs it. - */ - ret = -EPERM; - if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0) - goto out; - if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) - goto out_umount; - - - /* Get read-only handle that we're sure can't be made read-write. */ - ret = open(template, O_PATH | O_CLOEXEC); - -out_umount: - /* - * Make sure the MNT_DETACH works, otherwise we could get remounted - * read-write and that would be quite bad (the fd would be made read-write - * too, invalidating the protection). - */ - if (umount2(template, MNT_DETACH) < 0) { - if (ret >= 0) - close(ret); - ret = -ENOTRECOVERABLE; - } - -out: - /* - * We don't care about unlink errors, the worst that happens is that - * there's an empty file left around in STATEDIR. - */ - unlink(template); - return ret; -} - -static ssize_t fd_to_fd(int outfd, int infd) -{ - ssize_t total = 0; - char buffer[4096]; - - for (;;) { - ssize_t nread, nwritten = 0; - - nread = read(infd, buffer, sizeof(buffer)); - if (nread < 0) - return -1; - if (!nread) - break; - - do { - ssize_t n = write(outfd, buffer + nwritten, nread - nwritten); - if (n < 0) - return -1; - nwritten += n; - } while(nwritten < nread); - - total += nwritten; - } - - return total; -} - -static int clone_binary(void) -{ - int binfd, execfd; - struct stat statbuf = {}; - size_t sent = 0; - int fdtype = EFD_NONE; - - /* - * Before we resort to copying, let's try creating an ro-binfd in one shot - * by getting a handle for a read-only bind-mount of the execfd. - */ - execfd = try_bindfd(); - if (execfd >= 0) - return execfd; - - /* - * Dammit, that didn't work -- time to copy the binary to a safe place we - * can seal the contents. - */ - execfd = make_execfd(&fdtype); - if (execfd < 0 || fdtype == EFD_NONE) - return -ENOTRECOVERABLE; - - binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); - if (binfd < 0) - goto error; - - if (fstat(binfd, &statbuf) < 0) - goto error_binfd; - - while (sent < statbuf.st_size) { - int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); - if (n < 0) { - /* sendfile can fail so we fallback to a dumb user-space copy. */ - n = fd_to_fd(execfd, binfd); - if (n < 0) - goto error_binfd; - } - sent += n; - } - close(binfd); - if (sent != statbuf.st_size) - goto error; - - if (seal_execfd(&execfd, fdtype) < 0) - goto error; - - return execfd; - -error_binfd: - close(binfd); -error: - close(execfd); - return -EIO; -} - -/* Get cheap access to the environment. */ -extern char **environ; - -int ensure_cloned_binary(void) -{ - int execfd; - char **argv = NULL; - - /* Check that we're not self-cloned, and if we are then bail. */ - int cloned = is_self_cloned(); - if (cloned > 0 || cloned == -ENOTRECOVERABLE) - return cloned; - - if (fetchve(&argv) < 0) - return -EINVAL; - - execfd = clone_binary(); - if (execfd < 0) - return -EIO; - - if (putenv(CLONED_BINARY_ENV "=1")) - goto error; - - fexecve(execfd, argv, environ); -error: - close(execfd); - return -ENOEXEC; -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c index 7750af35ea..2c69cee5d6 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -42,12 +42,6 @@ enum sync_t { SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */ }; -/* - * Synchronisation value for cgroup namespace setup. - * The same constant is defined in process_linux.go as "createCgroupns". - */ -#define CREATECGROUPNS 0x80 - /* longjmp() arguments. */ #define JUMP_PARENT 0x00 #define JUMP_CHILD 0xA0 @@ -88,7 +82,7 @@ struct nlconfig_t { uint8_t is_setgroup; /* Rootless container settings. */ - uint8_t is_rootless_euid; /* boolean */ + uint8_t is_rootless; char *uidmappath; size_t uidmappath_len; char *gidmappath; @@ -106,7 +100,7 @@ struct nlconfig_t { #define GIDMAP_ATTR 27284 #define SETGROUP_ATTR 27285 #define OOM_SCORE_ADJ_ATTR 27286 -#define ROOTLESS_EUID_ATTR 27287 +#define ROOTLESS_ATTR 27287 #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 @@ -217,7 +211,7 @@ static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) /* * If @app is NULL, execve will segfault. Just check it here and bail (if - * we're in this path, the caller is already getting desperate and there + * we're in this path, the caller is already getting desparate and there * isn't a backup to this failing). This usually would be a configuration * or programming issue. */ @@ -425,8 +419,8 @@ static void nl_parse(int fd, struct nlconfig_t *config) case CLONE_FLAGS_ATTR: config->cloneflags = readint32(current); break; - case ROOTLESS_EUID_ATTR: - config->is_rootless_euid = readint8(current); /* boolean */ + case ROOTLESS_ATTR: + config->is_rootless = readint8(current); break; case OOM_SCORE_ADJ_ATTR: config->oom_score_adj = current; @@ -511,8 +505,7 @@ void join_namespaces(char *nslist) ns->fd = fd; ns->ns = nsflag(namespace); - strncpy(ns->path, path, PATH_MAX - 1); - ns->path[PATH_MAX - 1] = '\0'; + strncpy(ns->path, path, PATH_MAX); } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); /* @@ -534,9 +527,6 @@ void join_namespaces(char *nslist) free(namespaces); } -/* Defined in cloned_binary.c. */ -extern int ensure_cloned_binary(void); - void nsexec(void) { int pipenum; @@ -552,14 +542,6 @@ void nsexec(void) if (pipenum == -1) return; - /* - * We need to re-exec if we are not in a cloned binary. This is necessary - * to ensure that containers won't be able to access the host binary - * through /proc/self/exe. See CVE-2019-5736. - */ - if (ensure_cloned_binary() < 0) - bail("could not ensure we are a cloned binary"); - /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); @@ -657,6 +639,7 @@ void nsexec(void) case JUMP_PARENT:{ int len; pid_t child, first_child = -1; + char buf[JSON_MAX]; bool ready = false; /* For debugging. */ @@ -695,15 +678,17 @@ void nsexec(void) /* * Enable setgroups(2) if we've been asked to. But we also * have to explicitly disable setgroups(2) if we're - * creating a rootless container for single-entry mapping. - * i.e. config.is_setgroup == false. - * (this is required since Linux 3.19). - * - * For rootless multi-entry mapping, config.is_setgroup shall be true and - * newuidmap/newgidmap shall be used. + * creating a rootless container (this is required since + * Linux 3.19). */ + if (config.is_rootless && config.is_setgroup) { + kill(child, SIGKILL); + bail("cannot allow setgroup in an unprivileged user namespace setup"); + } - if (config.is_rootless_euid && !config.is_setgroup) + if (config.is_setgroup) + update_setgroups(child, SETGROUPS_ALLOW); + if (config.is_rootless) update_setgroups(child, SETGROUPS_DENY); /* Set up mappings. */ @@ -732,18 +717,6 @@ void nsexec(void) kill(child, SIGKILL); bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); } - - /* Send the init_func pid back to our parent. - * - * Send the init_func pid and the pid of the first child back to our parent. - * We need to send both back because we can't reap the first child we created (CLONE_PARENT). - * It becomes the responsibility of our parent to reap the first child. - */ - len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); - if (len < 0) { - kill(child, SIGKILL); - bail("unable to generate JSON for child pid"); - } } break; case SYNC_CHILD_READY: @@ -787,6 +760,23 @@ void nsexec(void) bail("unexpected sync value: %u", s); } } + + /* + * Send the init_func pid and the pid of the first child back to our parent. + * + * We need to send both back because we can't reap the first child we created (CLONE_PARENT). + * It becomes the responsibility of our parent to reap the first child. + */ + len = snprintf(buf, JSON_MAX, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); + if (len < 0) { + kill(child, SIGKILL); + bail("unable to generate JSON for child pid"); + } + if (write(pipenum, buf, len) != len) { + kill(child, SIGKILL); + bail("unable to send child pid to bootstrapper"); + } + exit(0); } @@ -819,30 +809,25 @@ void nsexec(void) if (config.namespaces) join_namespaces(config.namespaces); + /* + * Unshare all of the namespaces. Now, it should be noted that this + * ordering might break in the future (especially with rootless + * containers). But for now, it's not possible to split this into + * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. + * + * Note that we don't merge this with clone() because there were + * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) + * was broken, so we'll just do it the long way anyway. + */ + if (unshare(config.cloneflags) < 0) + bail("failed to unshare namespaces"); + /* * Deal with user namespaces first. They are quite special, as they * affect our ability to unshare other namespaces and are used as * context for privilege checks. - * - * We don't unshare all namespaces in one go. The reason for this - * is that, while the kernel documentation may claim otherwise, - * there are certain cases where unsharing all namespaces at once - * will result in namespace objects being owned incorrectly. - * Ideally we should just fix these kernel bugs, but it's better to - * be safe than sorry, and fix them separately. - * - * A specific case of this is that the SELinux label of the - * internal kern-mount that mqueue uses will be incorrect if the - * UTS namespace is cloned before the USER namespace is mapped. - * I've also heard of similar problems with the network namespace - * in some scenarios. This also mirrors how LXC deals with this - * problem. */ if (config.cloneflags & CLONE_NEWUSER) { - if (unshare(CLONE_NEWUSER) < 0) - bail("failed to unshare user namespace"); - config.cloneflags &= ~CLONE_NEWUSER; - /* * We don't have the privileges to do any mapping here (see the * clone_parent rant). So signal our parent to hook us up. @@ -868,23 +853,7 @@ void nsexec(void) if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) bail("failed to set process as dumpable"); } - - /* Become root in the namespace proper. */ - if (setresuid(0, 0, 0) < 0) - bail("failed to become root in user namespace"); } - /* - * Unshare all of the namespaces. Now, it should be noted that this - * ordering might break in the future (especially with rootless - * containers). But for now, it's not possible to split this into - * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues. - * - * Note that we don't merge this with clone() because there were - * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) - * was broken, so we'll just do it the long way anyway. - */ - if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) - bail("failed to unshare namespaces"); /* * TODO: What about non-namespace clone flags that we're dropping here? @@ -967,23 +936,11 @@ void nsexec(void) if (setgid(0) < 0) bail("setgid failed"); - if (!config.is_rootless_euid && config.is_setgroup) { + if (!config.is_rootless && config.is_setgroup) { if (setgroups(0, NULL) < 0) bail("setgroups failed"); } - /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ - if (config.cloneflags & CLONE_NEWCGROUP) { - uint8_t value; - if (read(pipenum, &value, sizeof(value)) != sizeof(value)) - bail("read synchronisation value failed"); - if (value == CREATECGROUPNS) { - if (unshare(CLONE_NEWCGROUP) < 0) - bail("failed to unshare cgroup namespace"); - } else - bail("received unknown synchronisation value"); - } - s = SYNC_CHILD_READY; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with patent: write(SYNC_CHILD_READY)"); diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process.go b/vendor/github.com/opencontainers/runc/libcontainer/process.go index 9a7c601412..86bf7387f8 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process.go @@ -72,9 +72,6 @@ type Process struct { // ConsoleSocket provides the masterfd console. ConsoleSocket *os.File - // Init specifies whether the process is the first process in the container. - Init bool - ops processOperations } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go index e8ffac9fa5..58980b0594 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -22,10 +22,6 @@ import ( "golang.org/x/sys/unix" ) -// Synchronisation value for cgroup namespace setup. -// The same constant is defined in nsexec.c as "CREATECGROUPNS". -const createCgroupns = 0x80 - type parentProcess interface { // pid returns the pid for the running process. pid() int @@ -50,16 +46,15 @@ type parentProcess interface { } type setnsProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - cgroupPaths map[string]string - rootlessCgroups bool - intelRdtPath string - config *initConfig - fds []string - process *Process - bootstrapData io.Reader + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + cgroupPaths map[string]string + intelRdtPath string + config *initConfig + fds []string + process *Process + bootstrapData io.Reader } func (p *setnsProcess) startTime() (uint64, error) { @@ -91,7 +86,7 @@ func (p *setnsProcess) start() (err error) { return newSystemErrorWithCause(err, "executing setns process") } if len(p.cgroupPaths) > 0 { - if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups { + if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } @@ -229,17 +224,12 @@ func (p *initProcess) externalDescriptors() []string { return p.fds } -// getChildPid receives the final child's pid over the provided pipe. -func (p *initProcess) getChildPid() (int, error) { - var pid pid - if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { - p.cmd.Wait() - return -1, err - } - return pid.Pid, nil -} - -func (p *initProcess) waitForChildExit(childPid int) error { +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. +// This is called by initProcess.start function +func (p *initProcess) execSetns() error { status, err := p.cmd.Process.Wait() if err != nil { p.cmd.Wait() @@ -249,8 +239,22 @@ func (p *initProcess) waitForChildExit(childPid int) error { p.cmd.Wait() return &exec.ExitError{ProcessState: status} } + var pid *pid + if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil { + p.cmd.Wait() + return err + } - process, err := os.FindProcess(childPid) + // Clean up the zombie parent process + firstChildProcess, err := os.FindProcess(pid.PidFirstChild) + if err != nil { + return err + } + + // Ignore the error in case the child has already been reaped for any reason + _, _ = firstChildProcess.Wait() + + process, err := os.FindProcess(pid.Pid) if err != nil { return err } @@ -292,47 +296,19 @@ func (p *initProcess) start() error { if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { return newSystemErrorWithCause(err, "copying bootstrap data to pipe") } - childPid, err := p.getChildPid() - if err != nil { - return newSystemErrorWithCause(err, "getting the final child's pid from pipe") + + if err := p.execSetns(); err != nil { + return newSystemErrorWithCause(err, "running exec setns process for init") } // Save the standard descriptor names before the container process // can potentially move them (e.g., via dup2()). If we don't do this now, // we won't know at checkpoint time which file descriptor to look up. - fds, err := getPipeFds(childPid) + fds, err := getPipeFds(p.pid()) if err != nil { - return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid) + return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children - // can escape the cgroup - if err := p.manager.Apply(childPid); err != nil { - return newSystemErrorWithCause(err, "applying cgroup configuration for process") - } - if p.intelRdtManager != nil { - if err := p.intelRdtManager.Apply(childPid); err != nil { - return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") - } - } - // Now it's time to setup cgroup namesapce - if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" { - if _, err := p.parentPipe.Write([]byte{createCgroupns}); err != nil { - return newSystemErrorWithCause(err, "sending synchronization value to init process") - } - } - - // Wait for our first child to exit - if err := p.waitForChildExit(childPid); err != nil { - return newSystemErrorWithCause(err, "waiting for our first child to exit") - } - - defer func() { - if err != nil { - // TODO: should not be the responsibility to call here - p.manager.Destroy() - } - }() if err := p.createNetworkInterfaces(); err != nil { return newSystemErrorWithCause(err, "creating network interfaces") } @@ -365,13 +341,14 @@ func (p *initProcess) start() error { } if p.config.Config.Hooks != nil { - s, err := p.container.currentOCIState() - if err != nil { - return err + bundle, annotations := utils.Annotations(p.container.config.Labels) + s := configs.HookState{ + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Bundle: bundle, + Annotations: annotations, } - // initProcessStartTime hasn't been set yet. - s.Pid = p.cmd.Process.Pid - s.Status = "creating" for i, hook := range p.config.Config.Hooks.Prestart { if err := hook.Run(s); err != nil { return newSystemErrorWithCausef(err, "running prestart hook %d", i) @@ -395,13 +372,14 @@ func (p *initProcess) start() error { } } if p.config.Config.Hooks != nil { - s, err := p.container.currentOCIState() - if err != nil { - return err + bundle, annotations := utils.Annotations(p.container.config.Labels) + s := configs.HookState{ + Version: p.container.config.Version, + ID: p.container.id, + Pid: p.pid(), + Bundle: bundle, + Annotations: annotations, } - // initProcessStartTime hasn't been set yet. - s.Pid = p.cmd.Process.Pid - s.Status = "creating" for i, hook := range p.config.Config.Hooks.Prestart { if err := hook.Run(s); err != nil { return newSystemErrorWithCausef(err, "running prestart hook %d", i) @@ -559,7 +537,7 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { } fds = append(fds, r.Fd(), w.Fd()) p.Stderr, i.Stderr = w, r - // change ownership of the pipes in case we are in a user namespace + // change ownership of the pipes incase we are in a user namespace for _, fd := range fds { if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil { return nil, err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go index f13b226e44..cf715d6649 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -46,7 +46,6 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { return newSystemErrorWithCause(err, "preparing rootfs") } - hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP) setupDev := needsSetupDev(config) for _, m := range config.Mounts { for _, precmd := range m.PremountCmds { @@ -54,7 +53,8 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { return newSystemErrorWithCause(err, "running premount command") } } - if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil { + + if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil { return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination) } @@ -152,26 +152,6 @@ func finalizeRootfs(config *configs.Config) (err error) { return nil } -// /tmp has to be mounted as private to allow MS_MOVE to work in all situations -func prepareTmp(topTmpDir string) (string, error) { - tmpdir, err := ioutil.TempDir(topTmpDir, "runctop") - if err != nil { - return "", err - } - if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil { - return "", err - } - if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil { - return "", err - } - return tmpdir, nil -} - -func cleanupTmp(tmpdir string) error { - unix.Unmount(tmpdir, 0) - return os.RemoveAll(tmpdir) -} - func mountCmd(cmd configs.Command) error { command := exec.Command(cmd.Path, cmd.Args[:]...) command.Env = cmd.Env @@ -182,34 +162,7 @@ func mountCmd(cmd configs.Command) error { return nil } -func prepareBindMount(m *configs.Mount, rootfs string) error { - stat, err := os.Stat(m.Source) - if err != nil { - // error out if the source of a bind mount does not exist as we will be - // unable to bind anything to it. - return err - } - // ensure that the destination of the bind mount is resolved of symlinks at mount time because - // any previous mounts can invalidate the next mount's destination. - // this can happen when a user specifies mounts within other mounts to cause breakouts or other - // evil stuff to try to escape the container's rootfs. - var dest string - if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { - return err - } - if err := checkMountDestination(rootfs, dest); err != nil { - return err - } - // update the mount with the correct dest after symlinks are resolved. - m.Destination = dest - if err := createIfNotExists(dest, stat.IsDir()); err != nil { - return err - } - - return nil -} - -func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { +func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error { var ( dest = m.Destination ) @@ -246,12 +199,7 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b } } if copyUp { - tmpdir, err := prepareTmp("/tmp") - if err != nil { - return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir") - } - defer cleanupTmp(tmpdir) - tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir") + tmpDir, err = ioutil.TempDir("/tmp", "runctmpdir") if err != nil { return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir") } @@ -284,7 +232,25 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b } return nil case "bind": - if err := prepareBindMount(m, rootfs); err != nil { + stat, err := os.Stat(m.Source) + if err != nil { + // error out if the source of a bind mount does not exist as we will be + // unable to bind anything to it. + return err + } + // ensure that the destination of the bind mount is resolved of symlinks at mount time because + // any previous mounts can invalidate the next mount's destination. + // this can happen when a user specifies mounts within other mounts to cause breakouts or other + // evil stuff to try to escape the container's rootfs. + if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { + return err + } + if err := checkMountDestination(rootfs, dest); err != nil { + return err + } + // update the mount with the correct dest after symlinks are resolved. + m.Destination = dest + if err := createIfNotExists(dest, stat.IsDir()); err != nil { return err } if err := mountPropagate(m, rootfs, mountLabel); err != nil { @@ -328,33 +294,12 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b Data: "mode=755", PropagationFlags: m.PropagationFlags, } - if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil { + if err := mountToRootfs(tmpfs, rootfs, mountLabel); err != nil { return err } for _, b := range binds { - if enableCgroupns { - subsystemPath := filepath.Join(rootfs, b.Destination) - if err := os.MkdirAll(subsystemPath, 0755); err != nil { - return err - } - flags := defaultMountFlags - if m.Flags&unix.MS_RDONLY != 0 { - flags = flags | unix.MS_RDONLY - } - cgroupmount := &configs.Mount{ - Source: "cgroup", - Device: "cgroup", - Destination: subsystemPath, - Flags: flags, - Data: filepath.Base(subsystemPath), - } - if err := mountNewCgroup(cgroupmount); err != nil { - return err - } - } else { - if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil { - return err - } + if err := mountToRootfs(b, rootfs, mountLabel); err != nil { + return err } } for _, mc := range merged { @@ -451,7 +396,6 @@ func checkMountDestination(rootfs, dest string) error { "/proc/stat", "/proc/swaps", "/proc/uptime", - "/proc/loadavg", "/proc/net/dev", } for _, valid := range validDestinations { @@ -468,7 +412,7 @@ func checkMountDestination(rootfs, dest string) error { if err != nil { return err } - if path != "." && !strings.HasPrefix(path, "..") { + if path == "." || !strings.HasPrefix(path, "..") { return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid) } } @@ -757,41 +701,6 @@ func pivotRoot(rootfs string) error { } func msMoveRoot(rootfs string) error { - mountinfos, err := mount.GetMounts() - if err != nil { - return err - } - - absRootfs, err := filepath.Abs(rootfs) - if err != nil { - return err - } - - for _, info := range mountinfos { - p, err := filepath.Abs(info.Mountpoint) - if err != nil { - return err - } - // Umount every syfs and proc file systems, except those under the container rootfs - if (info.Fstype != "proc" && info.Fstype != "sysfs") || filepath.HasPrefix(p, absRootfs) { - continue - } - // Be sure umount events are not propagated to the host. - if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { - return err - } - if err := unix.Unmount(p, unix.MNT_DETACH); err != nil { - if err != unix.EINVAL && err != unix.EPERM { - return err - } else { - // If we have not privileges for umounting (e.g. rootless), then - // cover the path. - if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil { - return err - } - } - } - } if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil { return err } @@ -893,7 +802,10 @@ func remount(m *configs.Mount, rootfs string) error { if !strings.HasPrefix(dest, rootfs) { dest = filepath.Join(rootfs, dest) } - return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "") + if err := unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), ""); err != nil { + return err + } + return nil } // Do the mount operation followed by additional mounts required to take care @@ -924,18 +836,3 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { } return nil } - -func mountNewCgroup(m *configs.Mount) error { - var ( - data = m.Data - source = m.Source - ) - if data == "systemd" { - data = cgroups.CgroupNamePrefix + data - source = "systemd" - } - if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil { - return err - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go index 6613bb65cb..096c601e76 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go @@ -5,14 +5,12 @@ package libcontainer import ( "fmt" "os" - "runtime" "github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/selinux/go-selinux/label" - "github.com/pkg/errors" "golang.org/x/sys/unix" ) @@ -30,19 +28,10 @@ func (l *linuxSetnsInit) getSessionRingName() string { } func (l *linuxSetnsInit) Init() error { - runtime.LockOSThread() - defer runtime.UnlockOSThread() - if !l.config.Config.NoNewKeyring { - // Do not inherit the parent's session keyring. + // do not inherit the parent's session keyring if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil { - // Same justification as in standart_init_linux.go as to why we - // don't bail on ENOSYS. - // - // TODO(cyphar): And we should have logging here too. - if errors.Cause(err) != unix.ENOSYS { - return errors.Wrap(err, "join session keyring") - } + return err } } if l.config.CreateConsole { @@ -58,10 +47,6 @@ func (l *linuxSetnsInit) Init() error { return err } } - if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { - return err - } - defer label.SetProcessLabel("") // Without NoNewPrivileges seccomp is a privileged operation, so we need to // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. @@ -76,6 +61,9 @@ func (l *linuxSetnsInit) Init() error { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err + } // Set seccomp as close to execve as possible, so as few syscalls take // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go index 827ca9e781..c113b337f3 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/example.go @@ -156,7 +156,7 @@ func Example() *specs.Spec { } // ToRootless converts the given spec file into one that should work with -// rootless containers (euid != 0), by removing incompatible options and adding others that +// rootless containers, by removing incompatible options and adding others that // are needed. func ToRootless(spec *specs.Spec) { var namespaces []specs.LinuxNamespace diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go index f68cac011a..98fd2e6321 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go @@ -28,7 +28,6 @@ var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ specs.UserNamespace: configs.NEWUSER, specs.IPCNamespace: configs.NEWIPC, specs.UTSNamespace: configs.NEWUTS, - specs.CgroupNamespace: configs.NEWCGROUP, } var mountPropagationMapping = map[string]int{ @@ -149,8 +148,7 @@ type CreateOpts struct { NoPivotRoot bool NoNewKeyring bool Spec *specs.Spec - RootlessEUID bool - RootlessCgroups bool + Rootless bool } // CreateLibcontainerConfig creates a new libcontainer configuration from a @@ -178,14 +176,13 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { labels = append(labels, fmt.Sprintf("%s=%s", k, v)) } config := &configs.Config{ - Rootfs: rootfsPath, - NoPivotRoot: opts.NoPivotRoot, - Readonlyfs: spec.Root.Readonly, - Hostname: spec.Hostname, - Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), - NoNewKeyring: opts.NoNewKeyring, - RootlessEUID: opts.RootlessEUID, - RootlessCgroups: opts.RootlessCgroups, + Rootfs: rootfsPath, + NoPivotRoot: opts.NoPivotRoot, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, + Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), + NoNewKeyring: opts.NoNewKeyring, + Rootless: opts.Rootless, } exists := false @@ -219,7 +216,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { } config.Namespaces.Add(t, ns.Path) } - if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" { + if config.Namespaces.Contains(configs.NEWNET) { config.Networks = []*configs.Network{ { Type: "loopback", @@ -236,56 +233,49 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { config.MountLabel = spec.Linux.MountLabel config.Sysctl = spec.Linux.Sysctl if spec.Linux.Seccomp != nil { - seccomp, err := SetupSeccomp(spec.Linux.Seccomp) + seccomp, err := setupSeccomp(spec.Linux.Seccomp) if err != nil { return nil, err } config.Seccomp = seccomp } - if spec.Linux.IntelRdt != nil { - config.IntelRdt = &configs.IntelRdt{} - if spec.Linux.IntelRdt.L3CacheSchema != "" { - config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema - } - if spec.Linux.IntelRdt.MemBwSchema != "" { - config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema - } - } } - if spec.Process != nil { - config.OomScoreAdj = spec.Process.OOMScoreAdj - if spec.Process.SelinuxLabel != "" { - config.ProcessLabel = spec.Process.SelinuxLabel - } - if spec.Process.Capabilities != nil { - config.Capabilities = &configs.Capabilities{ - Bounding: spec.Process.Capabilities.Bounding, - Effective: spec.Process.Capabilities.Effective, - Permitted: spec.Process.Capabilities.Permitted, - Inheritable: spec.Process.Capabilities.Inheritable, - Ambient: spec.Process.Capabilities.Ambient, - } + if spec.Process.SelinuxLabel != "" { + config.ProcessLabel = spec.Process.SelinuxLabel + } + if spec.Process != nil && spec.Process.OOMScoreAdj != nil { + config.OomScoreAdj = *spec.Process.OOMScoreAdj + } + if spec.Process.Capabilities != nil { + config.Capabilities = &configs.Capabilities{ + Bounding: spec.Process.Capabilities.Bounding, + Effective: spec.Process.Capabilities.Effective, + Permitted: spec.Process.Capabilities.Permitted, + Inheritable: spec.Process.Capabilities.Inheritable, + Ambient: spec.Process.Capabilities.Ambient, } } createHooks(spec, config) config.Version = specs.Version + if spec.Linux.IntelRdt != nil { + config.IntelRdt = &configs.IntelRdt{} + if spec.Linux.IntelRdt.L3CacheSchema != "" { + config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema + } + } return config, nil } func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { flags, pgflags, data, ext := parseMountOptions(m.Options) source := m.Source - device := m.Type - if flags&unix.MS_BIND != 0 { - if device == "" { - device = "bind" - } + if m.Type == "bind" { if !filepath.IsAbs(source) { source = filepath.Join(cwd, m.Source) } } return &configs.Mount{ - Device: device, + Device: m.Type, Source: source, Destination: m.Destination, Data: data, @@ -325,7 +315,7 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { // for e.g. "system.slice:docker:1234" parts := strings.Split(myCgroupPath, ":") if len(parts) != 3 { - return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", myCgroupPath) + return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups") } c.Parent = parts[0] c.ScopePrefix = parts[1] @@ -338,9 +328,12 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { c.Path = myCgroupPath } - // In rootless containers, any attempt to make cgroup changes is likely to fail. - // libcontainer will validate this but ignores the error. - c.Resources.AllowedDevices = allowedDevices + // In rootless containers, any attempt to make cgroup changes will fail. + // libcontainer will validate this and we shouldn't add any cgroup options + // the user didn't specify. + if !opts.Rootless { + c.Resources.AllowedDevices = allowedDevices + } if spec.Linux != nil { r := spec.Linux.Resources if r == nil { @@ -493,8 +486,10 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { } } } - // append the default allowed devices to the end of the list - c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) + if !opts.Rootless { + // append the default allowed devices to the end of the list + c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) + } return c, nil } @@ -737,7 +732,7 @@ func parseMountOptions(options []string) (int, []int, string, int) { return flag, pgflag, strings.Join(data, ","), extFlags } -func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { +func setupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { if config == nil { return nil, nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go index ad7ee8d8c8..e74d800256 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -6,7 +6,6 @@ import ( "fmt" "os" "os/exec" - "runtime" "syscall" //only for Exec "github.com/opencontainers/runc/libcontainer/apparmor" @@ -15,7 +14,6 @@ import ( "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/selinux/go-selinux/label" - "github.com/pkg/errors" "golang.org/x/sys/unix" ) @@ -45,31 +43,17 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { } func (l *linuxStandardInit) Init() error { - runtime.LockOSThread() - defer runtime.UnlockOSThread() if !l.config.Config.NoNewKeyring { ringname, keepperms, newperms := l.getSessionRingParams() // Do not inherit the parent's session keyring. - if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil { - // If keyrings aren't supported then it is likely we are on an - // older kernel (or inside an LXC container). While we could bail, - // the security feature we are using here is best-effort (it only - // really provides marginal protection since VFS credentials are - // the only significant protection of keyrings). - // - // TODO(cyphar): Log this so people know what's going on, once we - // have proper logging in 'runc init'. - if errors.Cause(err) != unix.ENOSYS { - return errors.Wrap(err, "join session keyring") - } - } else { - // Make session keyring searcheable. If we've gotten this far we - // bail on any error -- we don't want to have a keyring with bad - // permissions. - if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { - return errors.Wrap(err, "mod keyring permissions") - } + sessKeyId, err := keys.JoinSessionKeyring(ringname) + if err != nil { + return err + } + // Make session keyring searcheable. + if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { + return err } } @@ -92,7 +76,7 @@ func (l *linuxStandardInit) Init() error { return err } if err := system.Setctty(); err != nil { - return errors.Wrap(err, "setctty") + return err } } @@ -105,47 +89,46 @@ func (l *linuxStandardInit) Init() error { if hostname := l.config.Config.Hostname; hostname != "" { if err := unix.Sethostname([]byte(hostname)); err != nil { - return errors.Wrap(err, "sethostname") + return err } } if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { - return errors.Wrap(err, "apply apparmor profile") + return err + } + if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + return err } for key, value := range l.config.Config.Sysctl { if err := writeSystemProperty(key, value); err != nil { - return errors.Wrapf(err, "write sysctl key %s", key) + return err } } for _, path := range l.config.Config.ReadonlyPaths { if err := readonlyPath(path); err != nil { - return errors.Wrapf(err, "readonly path %s", path) + return err } } for _, path := range l.config.Config.MaskPaths { if err := maskPath(path, l.config.Config.MountLabel); err != nil { - return errors.Wrapf(err, "mask path %s", path) + return err } } pdeath, err := system.GetParentDeathSignal() if err != nil { - return errors.Wrap(err, "get pdeath signal") + return err } if l.config.NoNewPrivileges { if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { - return errors.Wrap(err, "set nonewprivileges") + return err } } // Tell our parent that we're ready to Execv. This must be done before the // Seccomp rules have been applied, because we need to be able to read and // write to a socket. if err := syncParentReady(l.pipe); err != nil { - return errors.Wrap(err, "sync ready") - } - if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { - return errors.Wrap(err, "set process label") + return err } - defer label.SetProcessLabel("") // Without NoNewPrivileges seccomp is a privileged operation, so we need to // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. @@ -160,7 +143,7 @@ func (l *linuxStandardInit) Init() error { // finalizeNamespace can change user/group which clears the parent death // signal, so we restore it here. if err := pdeath.Restore(); err != nil { - return errors.Wrap(err, "restore pdeath signal") + return err } // Compare the parent from the initial start of the init process and make // sure that it did not change. if the parent changes that means it died diff --git a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go index 5c16a423f7..b45ce23e4a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/state_linux.go @@ -8,6 +8,7 @@ import ( "path/filepath" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" @@ -62,9 +63,12 @@ func destroy(c *linuxContainer) error { func runPoststopHooks(c *linuxContainer) error { if c.config.Hooks != nil { - s, err := c.currentOCIState() - if err != nil { - return err + bundle, annotations := utils.Annotations(c.config.Labels) + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Bundle: bundle, + Annotations: annotations, } for _, hook := range c.config.Hooks.Poststop { if err := hook.Run(s); err != nil { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/sync.go b/vendor/github.com/opencontainers/runc/libcontainer/sync.go index a8704a2679..cf7b45bc32 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/sync.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/sync.go @@ -41,7 +41,10 @@ type syncT struct { // writeSync is used to write to a synchronisation pipe. An error is returned // if there was a problem writing the payload. func writeSync(pipe io.Writer, sync syncType) error { - return utils.WriteJSON(pipe, syncT{sync}) + if err := utils.WriteJSON(pipe, syncT{sync}); err != nil { + return err + } + return nil } // readSync is used to read from a synchronisation pipe. An error is returned diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go index a4ae8901ac..5f124cd8bb 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -3,12 +3,13 @@ package system import ( + "bufio" + "fmt" "os" "os/exec" "syscall" // only for exec "unsafe" - "github.com/opencontainers/runc/libcontainer/user" "golang.org/x/sys/unix" ) @@ -101,43 +102,34 @@ func Setctty() error { } // RunningInUserNS detects whether we are currently running in a user namespace. -// Originally copied from github.com/lxc/lxd/shared/util.go +// Copied from github.com/lxc/lxd/shared/util.go func RunningInUserNS() bool { - uidmap, err := user.CurrentProcessUIDMap() + file, err := os.Open("/proc/self/uid_map") if err != nil { // This kernel-provided file only exists if user namespaces are supported return false } - return UIDMapInUserNS(uidmap) -} + defer file.Close() + + buf := bufio.NewReader(file) + l, _, err := buf.ReadLine() + if err != nil { + return false + } -func UIDMapInUserNS(uidmap []user.IDMap) bool { + line := string(l) + var a, b, c int64 + fmt.Sscanf(line, "%d %d %d", &a, &b, &c) /* * We assume we are in the initial user namespace if we have a full * range - 4294967295 uids starting at uid 0. */ - if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { + if a == 0 && b == 0 && c == 4294967295 { return false } return true } -// GetParentNSeuid returns the euid within the parent user namespace -func GetParentNSeuid() int64 { - euid := int64(os.Geteuid()) - uidmap, err := user.CurrentProcessUIDMap() - if err != nil { - // This kernel-provided file only exists if user namespaces are supported - return euid - } - for _, um := range uidmap { - if um.ID <= euid && euid <= um.ID+um.Count-1 { - return um.ParentID + euid - um.ID - } - } - return euid -} - // SetSubreaper sets the value i as the subreaper setting for the calling process func SetSubreaper(i int) error { return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go index b94be74a66..e7cfd62b29 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/unsupported.go @@ -2,26 +2,8 @@ package system -import ( - "os" - - "github.com/opencontainers/runc/libcontainer/user" -) - // RunningInUserNS is a stub for non-Linux systems // Always returns false func RunningInUserNS() bool { return false } - -// UIDMapInUserNS is a stub for non-Linux systems -// Always returns false -func UIDMapInUserNS(uidmap []user.IDMap) bool { - return false -} - -// GetParentNSeuid returns the euid within the parent user namespace -// Always returns os.Geteuid on non-linux -func GetParentNSeuid() int { - return os.Geteuid() -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go index 92b5ae8de0..c45e300411 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go @@ -5,7 +5,6 @@ package user import ( "io" "os" - "strconv" "golang.org/x/sys/unix" ) @@ -115,30 +114,3 @@ func CurrentUser() (User, error) { func CurrentGroup() (Group, error) { return LookupGid(unix.Getgid()) } - -func currentUserSubIDs(fileName string) ([]SubID, error) { - u, err := CurrentUser() - if err != nil { - return nil, err - } - filter := func(entry SubID) bool { - return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid) - } - return ParseSubIDFileFilter(fileName, filter) -} - -func CurrentUserSubUIDs() ([]SubID, error) { - return currentUserSubIDs("/etc/subuid") -} - -func CurrentUserSubGIDs() ([]SubID, error) { - return currentUserSubIDs("/etc/subgid") -} - -func CurrentProcessUIDMap() ([]IDMap, error) { - return ParseIDMapFile("/proc/self/uid_map") -} - -func CurrentProcessGIDMap() ([]IDMap, error) { - return ParseIDMapFile("/proc/self/gid_map") -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go index 7b912bbf8b..93414516ca 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go @@ -75,29 +75,12 @@ func groupFromOS(g *user.Group) (Group, error) { return newGroup, nil } -// SubID represents an entry in /etc/sub{u,g}id -type SubID struct { - Name string - SubID int64 - Count int64 -} - -// IDMap represents an entry in /proc/PID/{u,g}id_map -type IDMap struct { - ID int64 - ParentID int64 - Count int64 -} - func parseLine(line string, v ...interface{}) { - parseParts(strings.Split(line, ":"), v...) -} - -func parseParts(parts []string, v ...interface{}) { - if len(parts) == 0 { + if line == "" { return } + parts := strings.Split(line, ":") for i, p := range parts { // Ignore cases where we don't have enough fields to populate the arguments. // Some configuration files like to misbehave. @@ -113,8 +96,6 @@ func parseParts(parts []string, v ...interface{}) { case *int: // "numbers", with conversion errors ignored because of some misbehaving configuration files. *e, _ = strconv.Atoi(p) - case *int64: - *e, _ = strconv.ParseInt(p, 10, 64) case *[]string: // Comma-separated lists. if p != "" { @@ -124,7 +105,7 @@ func parseParts(parts []string, v ...interface{}) { } default: // Someone goof'd when writing code using this function. Scream so they can hear us. - panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! %#v is not a pointer!", e)) + panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e)) } } } @@ -498,111 +479,3 @@ func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int } return GetAdditionalGroups(additionalGroups, group) } - -func ParseSubIDFile(path string) ([]SubID, error) { - subid, err := os.Open(path) - if err != nil { - return nil, err - } - defer subid.Close() - return ParseSubID(subid) -} - -func ParseSubID(subid io.Reader) ([]SubID, error) { - return ParseSubIDFilter(subid, nil) -} - -func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) { - subid, err := os.Open(path) - if err != nil { - return nil, err - } - defer subid.Close() - return ParseSubIDFilter(subid, filter) -} - -func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { - if r == nil { - return nil, fmt.Errorf("nil source for subid-formatted data") - } - - var ( - s = bufio.NewScanner(r) - out = []SubID{} - ) - - for s.Scan() { - if err := s.Err(); err != nil { - return nil, err - } - - line := strings.TrimSpace(s.Text()) - if line == "" { - continue - } - - // see: man 5 subuid - p := SubID{} - parseLine(line, &p.Name, &p.SubID, &p.Count) - - if filter == nil || filter(p) { - out = append(out, p) - } - } - - return out, nil -} - -func ParseIDMapFile(path string) ([]IDMap, error) { - r, err := os.Open(path) - if err != nil { - return nil, err - } - defer r.Close() - return ParseIDMap(r) -} - -func ParseIDMap(r io.Reader) ([]IDMap, error) { - return ParseIDMapFilter(r, nil) -} - -func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) { - r, err := os.Open(path) - if err != nil { - return nil, err - } - defer r.Close() - return ParseIDMapFilter(r, filter) -} - -func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { - if r == nil { - return nil, fmt.Errorf("nil source for idmap-formatted data") - } - - var ( - s = bufio.NewScanner(r) - out = []IDMap{} - ) - - for s.Scan() { - if err := s.Err(); err != nil { - return nil, err - } - - line := strings.TrimSpace(s.Text()) - if line == "" { - continue - } - - // see: man 7 user_namespaces - p := IDMap{} - parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count) - - if filter == nil || filter(p) { - out = append(out, p) - } - } - - return out, nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go index 40ccfaa1a0..baa54c9ba2 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -1,6 +1,8 @@ package utils import ( + "crypto/rand" + "encoding/hex" "encoding/json" "io" "os" @@ -15,6 +17,19 @@ const ( exitSignalOffset = 128 ) +// GenerateRandomName returns a new name joined with a prefix. This size +// specified is used to truncate the randomly generated value +func GenerateRandomName(prefix string, size int) (string, error) { + id := make([]byte, 32) + if _, err := io.ReadFull(rand.Reader, id); err != nil { + return "", err + } + if size > 64 { + size = 64 + } + return prefix + hex.EncodeToString(id)[:size], nil +} + // ResolveRootfs ensures that the current working directory is // not a symlink and returns the absolute path to the rootfs func ResolveRootfs(uncleanRootfs string) (string, error) {