diff --git a/go.mod b/go.mod index 8b392128d17e7..cccf9f35991a8 100644 --- a/go.mod +++ b/go.mod @@ -21,6 +21,7 @@ require ( github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 github.com/blang/semver/v4 v4.0.0 github.com/container-storage-interface/spec v1.8.0 + github.com/containerd/cgroups v1.1.0 github.com/coredns/corefile-migration v1.0.20 github.com/coreos/go-oidc v2.2.1+incompatible github.com/coreos/go-systemd/v22 v22.5.0 @@ -151,7 +152,6 @@ require ( github.com/chai2010/gettext-go v1.0.2 // indirect github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect github.com/cilium/ebpf v0.9.1 // indirect - github.com/containerd/cgroups v1.1.0 // indirect github.com/containerd/console v1.0.3 // indirect github.com/containerd/ttrpc v1.2.2 // indirect github.com/coredns/caddy v1.1.1 // indirect diff --git a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go index c600d49bc258d..c76389c0d9ff9 100644 --- a/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go +++ b/pkg/kubelet/kuberuntime/kuberuntime_container_linux.go @@ -20,21 +20,26 @@ limitations under the License. package kuberuntime import ( + "errors" "fmt" - cadvisorv1 "github.com/google/cadvisor/info/v1" - kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" "math" "os" + "path/filepath" "strconv" + "sync" "time" + "github.com/containerd/cgroups" + cadvisorv1 "github.com/google/cadvisor/info/v1" libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" utilfeature "k8s.io/apiserver/pkg/util/feature" runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1" "k8s.io/klog/v2" v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" + kubeapiqos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" kubefeatures "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/pkg/kubelet/cm" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" @@ -305,6 +310,36 @@ var isCgroup2UnifiedMode = func() bool { return libcontainercgroups.IsCgroup2UnifiedMode() } +var ( + swapControllerAvailability bool + swapControllerAvailabilityOnce sync.Once +) + +func swapControllerAvailable() bool { + // See https://github.com/containerd/containerd/pull/7838/ + swapControllerAvailabilityOnce.Do(func() { + const warn = "Failed to detect the availability of the swap controller, assuming not available" + p := "/sys/fs/cgroup/memory/memory.memsw.limit_in_bytes" + if libcontainercgroups.IsCgroup2UnifiedMode() { + // memory.swap.max does not exist in the cgroup root, so we check /sys/fs/cgroup//memory.swap.max + _, unified, err := cgroups.ParseCgroupFileUnified("/proc/self/cgroup") + if err != nil { + klog.V(5).ErrorS(fmt.Errorf("failed to parse /proc/self/cgroup: %w", err), warn) + return + } + p = filepath.Join("/sys/fs/cgroup", unified, "memory.swap.max") + } + if _, err := os.Stat(p); err != nil { + if !errors.Is(err, os.ErrNotExist) { + klog.V(5).ErrorS(err, warn) + } + return + } + swapControllerAvailability = true + }) + return swapControllerAvailability +} + type swapConfigurationHelper struct { machineInfo cadvisorv1.MachineInfo } @@ -337,10 +372,12 @@ func (m swapConfigurationHelper) ConfigureLimitedSwap(lcr *runtimeapi.LinuxConta func (m swapConfigurationHelper) ConfigureNoSwap(lcr *runtimeapi.LinuxContainerResources) { if !isCgroup2UnifiedMode() { - // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit - // Some swapping is still possible. - // Note that if memory limit is 0, memory swap limit is ignored. - lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes + if swapControllerAvailable() { + // memorySwapLimit = total permitted memory+swap; if equal to memory limit, => 0 swap above memory limit + // Some swapping is still possible. + // Note that if memory limit is 0, memory swap limit is ignored. + lcr.MemorySwapLimitInBytes = lcr.MemoryLimitInBytes + } return } diff --git a/vendor/github.com/containerd/cgroups/.gitignore b/vendor/github.com/containerd/cgroups/.gitignore new file mode 100644 index 0000000000000..3465c14cf7783 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/.gitignore @@ -0,0 +1,2 @@ +example/example +cmd/cgctl/cgctl diff --git a/vendor/github.com/containerd/cgroups/Makefile b/vendor/github.com/containerd/cgroups/Makefile new file mode 100644 index 0000000000000..19e6607561d5c --- /dev/null +++ b/vendor/github.com/containerd/cgroups/Makefile @@ -0,0 +1,24 @@ +# Copyright The containerd Authors. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +PACKAGES=$(shell go list ./... | grep -v /vendor/) + +all: cgutil + go build -v + +cgutil: + cd cmd/cgctl && go build -v + +proto: + protobuild --quiet ${PACKAGES} diff --git a/vendor/github.com/containerd/cgroups/Protobuild.toml b/vendor/github.com/containerd/cgroups/Protobuild.toml new file mode 100644 index 0000000000000..1c4c802fe12b6 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/Protobuild.toml @@ -0,0 +1,46 @@ +version = "unstable" +generator = "gogoctrd" +plugins = ["grpc"] + +# Control protoc include paths. Below are usually some good defaults, but feel +# free to try it without them if it works for your project. +[includes] + # Include paths that will be added before all others. Typically, you want to + # treat the root of the project as an include, but this may not be necessary. + # before = ["."] + + # Paths that should be treated as include roots in relation to the vendor + # directory. These will be calculated with the vendor directory nearest the + # target package. + # vendored = ["github.com/gogo/protobuf"] + packages = ["github.com/gogo/protobuf"] + + # Paths that will be added untouched to the end of the includes. We use + # `/usr/local/include` to pickup the common install location of protobuf. + # This is the default. + after = ["/usr/local/include", "/usr/include"] + +# This section maps protobuf imports to Go packages. These will become +# `-M` directives in the call to the go protobuf generator. +[packages] + "gogoproto/gogo.proto" = "github.com/gogo/protobuf/gogoproto" + "google/protobuf/any.proto" = "github.com/gogo/protobuf/types" + "google/protobuf/descriptor.proto" = "github.com/gogo/protobuf/protoc-gen-gogo/descriptor" + "google/protobuf/field_mask.proto" = "github.com/gogo/protobuf/types" + "google/protobuf/timestamp.proto" = "github.com/gogo/protobuf/types" + +# Aggregrate the API descriptors to lock down API changes. +[[descriptors]] +prefix = "github.com/containerd/cgroups/stats/v1" +target = "stats/v1/metrics.pb.txt" +ignore_files = [ + "google/protobuf/descriptor.proto", + "gogoproto/gogo.proto" +] +[[descriptors]] +prefix = "github.com/containerd/cgroups/v2/stats" +target = "v2/stats/metrics.pb.txt" +ignore_files = [ + "google/protobuf/descriptor.proto", + "gogoproto/gogo.proto" +] diff --git a/vendor/github.com/containerd/cgroups/README.md b/vendor/github.com/containerd/cgroups/README.md new file mode 100644 index 0000000000000..d2073af3abc84 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/README.md @@ -0,0 +1,204 @@ +# cgroups + +[![Build Status](https://github.com/containerd/cgroups/workflows/CI/badge.svg)](https://github.com/containerd/cgroups/actions?query=workflow%3ACI) +[![codecov](https://codecov.io/gh/containerd/cgroups/branch/main/graph/badge.svg)](https://codecov.io/gh/containerd/cgroups) +[![GoDoc](https://godoc.org/github.com/containerd/cgroups?status.svg)](https://godoc.org/github.com/containerd/cgroups) +[![Go Report Card](https://goreportcard.com/badge/github.com/containerd/cgroups)](https://goreportcard.com/report/github.com/containerd/cgroups) + +Go package for creating, managing, inspecting, and destroying cgroups. +The resources format for settings on the cgroup uses the OCI runtime-spec found +[here](https://github.com/opencontainers/runtime-spec). + +## Examples (v1) + +### Create a new cgroup + +This creates a new cgroup using a static path for all subsystems under `/test`. + +* /sys/fs/cgroup/cpu/test +* /sys/fs/cgroup/memory/test +* etc.... + +It uses a single hierarchy and specifies cpu shares as a resource constraint and +uses the v1 implementation of cgroups. + + +```go +shares := uint64(100) +control, err := cgroups.New(cgroups.V1, cgroups.StaticPath("/test"), &specs.LinuxResources{ + CPU: &specs.LinuxCPU{ + Shares: &shares, + }, +}) +defer control.Delete() +``` + +### Create with systemd slice support + + +```go +control, err := cgroups.New(cgroups.Systemd, cgroups.Slice("system.slice", "runc-test"), &specs.LinuxResources{ + CPU: &specs.CPU{ + Shares: &shares, + }, +}) + +``` + +### Load an existing cgroup + +```go +control, err = cgroups.Load(cgroups.V1, cgroups.StaticPath("/test")) +``` + +### Add a process to the cgroup + +```go +if err := control.Add(cgroups.Process{Pid:1234}); err != nil { +} +``` + +### Update the cgroup + +To update the resources applied in the cgroup + +```go +shares = uint64(200) +if err := control.Update(&specs.LinuxResources{ + CPU: &specs.LinuxCPU{ + Shares: &shares, + }, +}); err != nil { +} +``` + +### Freeze and Thaw the cgroup + +```go +if err := control.Freeze(); err != nil { +} +if err := control.Thaw(); err != nil { +} +``` + +### List all processes in the cgroup or recursively + +```go +processes, err := control.Processes(cgroups.Devices, recursive) +``` + +### Get Stats on the cgroup + +```go +stats, err := control.Stat() +``` + +By adding `cgroups.IgnoreNotExist` all non-existent files will be ignored, e.g. swap memory stats without swap enabled +```go +stats, err := control.Stat(cgroups.IgnoreNotExist) +``` + +### Move process across cgroups + +This allows you to take processes from one cgroup and move them to another. + +```go +err := control.MoveTo(destination) +``` + +### Create subcgroup + +```go +subCgroup, err := control.New("child", resources) +``` + +### Registering for memory events + +This allows you to get notified by an eventfd for v1 memory cgroups events. + +```go +event := cgroups.MemoryThresholdEvent(50 * 1024 * 1024, false) +efd, err := control.RegisterMemoryEvent(event) +``` + +```go +event := cgroups.MemoryPressureEvent(cgroups.MediumPressure, cgroups.DefaultMode) +efd, err := control.RegisterMemoryEvent(event) +``` + +```go +efd, err := control.OOMEventFD() +// or by using RegisterMemoryEvent +event := cgroups.OOMEvent() +efd, err := control.RegisterMemoryEvent(event) +``` + +## Examples (v2/unified) + +### Check that the current system is running cgroups v2 + +```go +var cgroupV2 bool +if cgroups.Mode() == cgroups.Unified { + cgroupV2 = true +} +``` + +### Create a new cgroup + +This creates a new systemd v2 cgroup slice. Systemd slices consider ["-" a special character](https://www.freedesktop.org/software/systemd/man/systemd.slice.html), +so the resulting slice would be located here on disk: + +* /sys/fs/cgroup/my.slice/my-cgroup.slice/my-cgroup-abc.slice + +```go +import ( + cgroupsv2 "github.com/containerd/cgroups/v2" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +res := cgroupsv2.Resources{} +// dummy PID of -1 is used for creating a "general slice" to be used as a parent cgroup. +// see https://github.com/containerd/cgroups/blob/1df78138f1e1e6ee593db155c6b369466f577651/v2/manager.go#L732-L735 +m, err := cgroupsv2.NewSystemd("/", "my-cgroup-abc.slice", -1, &res) +if err != nil { + return err +} +``` + +### Load an existing cgroup + +```go +m, err := cgroupsv2.LoadSystemd("/", "my-cgroup-abc.slice") +if err != nil { + return err +} +``` + +### Delete a cgroup + +```go +m, err := cgroupsv2.LoadSystemd("/", "my-cgroup-abc.slice") +if err != nil { + return err +} +err = m.DeleteSystemd() +if err != nil { + return err +} +``` + +### Attention + +All static path should not include `/sys/fs/cgroup/` prefix, it should start with your own cgroups name + +## Project details + +Cgroups is a containerd sub-project, licensed under the [Apache 2.0 license](./LICENSE). +As a containerd sub-project, you will find the: + + * [Project governance](https://github.com/containerd/project/blob/main/GOVERNANCE.md), + * [Maintainers](https://github.com/containerd/project/blob/main/MAINTAINERS), + * and [Contributing guidelines](https://github.com/containerd/project/blob/main/CONTRIBUTING.md) + +information in our [`containerd/project`](https://github.com/containerd/project) repository. diff --git a/vendor/github.com/containerd/cgroups/blkio.go b/vendor/github.com/containerd/cgroups/blkio.go new file mode 100644 index 0000000000000..f59c75bb5d1f1 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/blkio.go @@ -0,0 +1,361 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "bufio" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +// NewBlkio returns a Blkio controller given the root folder of cgroups. +// It may optionally accept other configuration options, such as ProcRoot(path) +func NewBlkio(root string, options ...func(controller *blkioController)) *blkioController { + ctrl := &blkioController{ + root: filepath.Join(root, string(Blkio)), + procRoot: "/proc", + } + for _, opt := range options { + opt(ctrl) + } + return ctrl +} + +// ProcRoot overrides the default location of the "/proc" filesystem +func ProcRoot(path string) func(controller *blkioController) { + return func(c *blkioController) { + c.procRoot = path + } +} + +type blkioController struct { + root string + procRoot string +} + +func (b *blkioController) Name() Name { + return Blkio +} + +func (b *blkioController) Path(path string) string { + return filepath.Join(b.root, path) +} + +func (b *blkioController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(b.Path(path), defaultDirPerm); err != nil { + return err + } + if resources.BlockIO == nil { + return nil + } + for _, t := range createBlkioSettings(resources.BlockIO) { + if t.value != nil { + if err := retryingWriteFile( + filepath.Join(b.Path(path), "blkio."+t.name), + t.format(t.value), + defaultFilePerm, + ); err != nil { + return err + } + } + } + return nil +} + +func (b *blkioController) Update(path string, resources *specs.LinuxResources) error { + return b.Create(path, resources) +} + +func (b *blkioController) Stat(path string, stats *v1.Metrics) error { + stats.Blkio = &v1.BlkIOStat{} + + var settings []blkioStatSettings + + // Try to read CFQ stats available on all CFQ enabled kernels first + if _, err := os.Lstat(filepath.Join(b.Path(path), "blkio.io_serviced_recursive")); err == nil { + settings = []blkioStatSettings{ + { + name: "sectors_recursive", + entry: &stats.Blkio.SectorsRecursive, + }, + { + name: "io_service_bytes_recursive", + entry: &stats.Blkio.IoServiceBytesRecursive, + }, + { + name: "io_serviced_recursive", + entry: &stats.Blkio.IoServicedRecursive, + }, + { + name: "io_queued_recursive", + entry: &stats.Blkio.IoQueuedRecursive, + }, + { + name: "io_service_time_recursive", + entry: &stats.Blkio.IoServiceTimeRecursive, + }, + { + name: "io_wait_time_recursive", + entry: &stats.Blkio.IoWaitTimeRecursive, + }, + { + name: "io_merged_recursive", + entry: &stats.Blkio.IoMergedRecursive, + }, + { + name: "time_recursive", + entry: &stats.Blkio.IoTimeRecursive, + }, + } + } + + f, err := os.Open(filepath.Join(b.procRoot, "partitions")) + if err != nil { + return err + } + defer f.Close() + + devices, err := getDevices(f) + if err != nil { + return err + } + + var size int + for _, t := range settings { + if err := b.readEntry(devices, path, t.name, t.entry); err != nil { + return err + } + size += len(*t.entry) + } + if size > 0 { + return nil + } + + // Even the kernel is compiled with the CFQ scheduler, the cgroup may not use + // block devices with the CFQ scheduler. If so, we should fallback to throttle.* files. + settings = []blkioStatSettings{ + { + name: "throttle.io_serviced", + entry: &stats.Blkio.IoServicedRecursive, + }, + { + name: "throttle.io_service_bytes", + entry: &stats.Blkio.IoServiceBytesRecursive, + }, + } + for _, t := range settings { + if err := b.readEntry(devices, path, t.name, t.entry); err != nil { + return err + } + } + return nil +} + +func (b *blkioController) readEntry(devices map[deviceKey]string, path, name string, entry *[]*v1.BlkIOEntry) error { + f, err := os.Open(filepath.Join(b.Path(path), "blkio."+name)) + if err != nil { + return err + } + defer f.Close() + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := strings.FieldsFunc(sc.Text(), splitBlkIOStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return fmt.Errorf("invalid line found while parsing %s: %s", path, sc.Text()) + } + } + major, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return err + } + minor, err := strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return err + } + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err := strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + return err + } + *entry = append(*entry, &v1.BlkIOEntry{ + Device: devices[deviceKey{major, minor}], + Major: major, + Minor: minor, + Op: op, + Value: v, + }) + } + return sc.Err() +} + +func createBlkioSettings(blkio *specs.LinuxBlockIO) []blkioSettings { + settings := []blkioSettings{} + + if blkio.Weight != nil { + settings = append(settings, + blkioSettings{ + name: "weight", + value: blkio.Weight, + format: uintf, + }) + } + if blkio.LeafWeight != nil { + settings = append(settings, + blkioSettings{ + name: "leaf_weight", + value: blkio.LeafWeight, + format: uintf, + }) + } + for _, wd := range blkio.WeightDevice { + if wd.Weight != nil { + settings = append(settings, + blkioSettings{ + name: "weight_device", + value: wd, + format: weightdev, + }) + } + if wd.LeafWeight != nil { + settings = append(settings, + blkioSettings{ + name: "leaf_weight_device", + value: wd, + format: weightleafdev, + }) + } + } + for _, t := range []struct { + name string + list []specs.LinuxThrottleDevice + }{ + { + name: "throttle.read_bps_device", + list: blkio.ThrottleReadBpsDevice, + }, + { + name: "throttle.read_iops_device", + list: blkio.ThrottleReadIOPSDevice, + }, + { + name: "throttle.write_bps_device", + list: blkio.ThrottleWriteBpsDevice, + }, + { + name: "throttle.write_iops_device", + list: blkio.ThrottleWriteIOPSDevice, + }, + } { + for _, td := range t.list { + settings = append(settings, blkioSettings{ + name: t.name, + value: td, + format: throttleddev, + }) + } + } + return settings +} + +type blkioSettings struct { + name string + value interface{} + format func(v interface{}) []byte +} + +type blkioStatSettings struct { + name string + entry *[]*v1.BlkIOEntry +} + +func uintf(v interface{}) []byte { + return []byte(strconv.FormatUint(uint64(*v.(*uint16)), 10)) +} + +func weightdev(v interface{}) []byte { + wd := v.(specs.LinuxWeightDevice) + return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, *wd.Weight)) +} + +func weightleafdev(v interface{}) []byte { + wd := v.(specs.LinuxWeightDevice) + return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, *wd.LeafWeight)) +} + +func throttleddev(v interface{}) []byte { + td := v.(specs.LinuxThrottleDevice) + return []byte(fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)) +} + +func splitBlkIOStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +type deviceKey struct { + major, minor uint64 +} + +// getDevices makes a best effort attempt to read all the devices into a map +// keyed by major and minor number. Since devices may be mapped multiple times, +// we err on taking the first occurrence. +func getDevices(r io.Reader) (map[deviceKey]string, error) { + + var ( + s = bufio.NewScanner(r) + devices = make(map[deviceKey]string) + ) + for i := 0; s.Scan(); i++ { + if i < 2 { + continue + } + fields := strings.Fields(s.Text()) + major, err := strconv.Atoi(fields[0]) + if err != nil { + return nil, err + } + minor, err := strconv.Atoi(fields[1]) + if err != nil { + return nil, err + } + key := deviceKey{ + major: uint64(major), + minor: uint64(minor), + } + if _, ok := devices[key]; ok { + continue + } + devices[key] = filepath.Join("/dev", fields[3]) + } + return devices, s.Err() +} diff --git a/vendor/github.com/containerd/cgroups/cgroup.go b/vendor/github.com/containerd/cgroups/cgroup.go new file mode 100644 index 0000000000000..c0eac5bf19d1b --- /dev/null +++ b/vendor/github.com/containerd/cgroups/cgroup.go @@ -0,0 +1,543 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + v1 "github.com/containerd/cgroups/stats/v1" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +// New returns a new control via the cgroup cgroups interface +func New(hierarchy Hierarchy, path Path, resources *specs.LinuxResources, opts ...InitOpts) (Cgroup, error) { + config := newInitConfig() + for _, o := range opts { + if err := o(config); err != nil { + return nil, err + } + } + subsystems, err := hierarchy() + if err != nil { + return nil, err + } + var active []Subsystem + for _, s := range subsystems { + // check if subsystem exists + if err := initializeSubsystem(s, path, resources); err != nil { + if err == ErrControllerNotActive { + if config.InitCheck != nil { + if skerr := config.InitCheck(s, path, err); skerr != nil { + if skerr != ErrIgnoreSubsystem { + return nil, skerr + } + } + } + continue + } + return nil, err + } + active = append(active, s) + } + return &cgroup{ + path: path, + subsystems: active, + }, nil +} + +// Load will load an existing cgroup and allow it to be controlled +// All static path should not include `/sys/fs/cgroup/` prefix, it should start with your own cgroups name +func Load(hierarchy Hierarchy, path Path, opts ...InitOpts) (Cgroup, error) { + config := newInitConfig() + for _, o := range opts { + if err := o(config); err != nil { + return nil, err + } + } + var activeSubsystems []Subsystem + subsystems, err := hierarchy() + if err != nil { + return nil, err + } + // check that the subsystems still exist, and keep only those that actually exist + for _, s := range pathers(subsystems) { + p, err := path(s.Name()) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil, ErrCgroupDeleted + } + if err == ErrControllerNotActive { + if config.InitCheck != nil { + if skerr := config.InitCheck(s, path, err); skerr != nil { + if skerr != ErrIgnoreSubsystem { + return nil, skerr + } + } + } + continue + } + return nil, err + } + if _, err := os.Lstat(s.Path(p)); err != nil { + if os.IsNotExist(err) { + continue + } + return nil, err + } + activeSubsystems = append(activeSubsystems, s) + } + // if we do not have any active systems then the cgroup is deleted + if len(activeSubsystems) == 0 { + return nil, ErrCgroupDeleted + } + return &cgroup{ + path: path, + subsystems: activeSubsystems, + }, nil +} + +type cgroup struct { + path Path + + subsystems []Subsystem + mu sync.Mutex + err error +} + +// New returns a new sub cgroup +func (c *cgroup) New(name string, resources *specs.LinuxResources) (Cgroup, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return nil, c.err + } + path := subPath(c.path, name) + for _, s := range c.subsystems { + if err := initializeSubsystem(s, path, resources); err != nil { + return nil, err + } + } + return &cgroup{ + path: path, + subsystems: c.subsystems, + }, nil +} + +// Subsystems returns all the subsystems that are currently being +// consumed by the group +func (c *cgroup) Subsystems() []Subsystem { + return c.subsystems +} + +func (c *cgroup) subsystemsFilter(subsystems ...Name) []Subsystem { + if len(subsystems) == 0 { + return c.subsystems + } + + var filteredSubsystems = []Subsystem{} + for _, s := range c.subsystems { + for _, f := range subsystems { + if s.Name() == f { + filteredSubsystems = append(filteredSubsystems, s) + break + } + } + } + + return filteredSubsystems +} + +// Add moves the provided process into the new cgroup. +// Without additional arguments, the process is added to all the cgroup subsystems. +// When giving Add a list of subsystem names, the process is only added to those +// subsystems, provided that they are active in the targeted cgroup. +func (c *cgroup) Add(process Process, subsystems ...Name) error { + return c.add(process, cgroupProcs, subsystems...) +} + +// AddProc moves the provided process id into the new cgroup. +// Without additional arguments, the process with the given id is added to all +// the cgroup subsystems. When giving AddProc a list of subsystem names, the process +// id is only added to those subsystems, provided that they are active in the targeted +// cgroup. +func (c *cgroup) AddProc(pid uint64, subsystems ...Name) error { + return c.add(Process{Pid: int(pid)}, cgroupProcs, subsystems...) +} + +// AddTask moves the provided tasks (threads) into the new cgroup. +// Without additional arguments, the task is added to all the cgroup subsystems. +// When giving AddTask a list of subsystem names, the task is only added to those +// subsystems, provided that they are active in the targeted cgroup. +func (c *cgroup) AddTask(process Process, subsystems ...Name) error { + return c.add(process, cgroupTasks, subsystems...) +} + +func (c *cgroup) add(process Process, pType procType, subsystems ...Name) error { + if process.Pid <= 0 { + return ErrInvalidPid + } + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + for _, s := range pathers(c.subsystemsFilter(subsystems...)) { + p, err := c.path(s.Name()) + if err != nil { + return err + } + err = retryingWriteFile( + filepath.Join(s.Path(p), pType), + []byte(strconv.Itoa(process.Pid)), + defaultFilePerm, + ) + if err != nil { + return err + } + } + return nil +} + +// Delete will remove the control group from each of the subsystems registered +func (c *cgroup) Delete() error { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + var errs []string + for _, s := range c.subsystems { + // kernel prevents cgroups with running process from being removed, check the tree is empty + procs, err := c.processes(s.Name(), true, cgroupProcs) + if err != nil { + return err + } + if len(procs) > 0 { + errs = append(errs, fmt.Sprintf("%s (contains running processes)", string(s.Name()))) + continue + } + if d, ok := s.(deleter); ok { + sp, err := c.path(s.Name()) + if err != nil { + return err + } + if err := d.Delete(sp); err != nil { + errs = append(errs, string(s.Name())) + } + continue + } + if p, ok := s.(pather); ok { + sp, err := c.path(s.Name()) + if err != nil { + return err + } + path := p.Path(sp) + if err := remove(path); err != nil { + errs = append(errs, path) + } + continue + } + } + if len(errs) > 0 { + return fmt.Errorf("cgroups: unable to remove paths %s", strings.Join(errs, ", ")) + } + c.err = ErrCgroupDeleted + return nil +} + +// Stat returns the current metrics for the cgroup +func (c *cgroup) Stat(handlers ...ErrorHandler) (*v1.Metrics, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return nil, c.err + } + if len(handlers) == 0 { + handlers = append(handlers, errPassthrough) + } + var ( + stats = &v1.Metrics{ + CPU: &v1.CPUStat{ + Throttling: &v1.Throttle{}, + Usage: &v1.CPUUsage{}, + }, + } + wg = &sync.WaitGroup{} + errs = make(chan error, len(c.subsystems)) + ) + for _, s := range c.subsystems { + if ss, ok := s.(stater); ok { + sp, err := c.path(s.Name()) + if err != nil { + return nil, err + } + wg.Add(1) + go func() { + defer wg.Done() + if err := ss.Stat(sp, stats); err != nil { + for _, eh := range handlers { + if herr := eh(err); herr != nil { + errs <- herr + } + } + } + }() + } + } + wg.Wait() + close(errs) + for err := range errs { + return nil, err + } + return stats, nil +} + +// Update updates the cgroup with the new resource values provided +// +// Be prepared to handle EBUSY when trying to update a cgroup with +// live processes and other operations like Stats being performed at the +// same time +func (c *cgroup) Update(resources *specs.LinuxResources) error { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + for _, s := range c.subsystems { + if u, ok := s.(updater); ok { + sp, err := c.path(s.Name()) + if err != nil { + return err + } + if err := u.Update(sp, resources); err != nil { + return err + } + } + } + return nil +} + +// Processes returns the processes running inside the cgroup along +// with the subsystem used, pid, and path +func (c *cgroup) Processes(subsystem Name, recursive bool) ([]Process, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return nil, c.err + } + return c.processes(subsystem, recursive, cgroupProcs) +} + +// Tasks returns the tasks running inside the cgroup along +// with the subsystem used, pid, and path +func (c *cgroup) Tasks(subsystem Name, recursive bool) ([]Task, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return nil, c.err + } + return c.processes(subsystem, recursive, cgroupTasks) +} + +func (c *cgroup) processes(subsystem Name, recursive bool, pType procType) ([]Process, error) { + s := c.getSubsystem(subsystem) + sp, err := c.path(subsystem) + if err != nil { + return nil, err + } + if s == nil { + return nil, fmt.Errorf("cgroups: %s doesn't exist in %s subsystem", sp, subsystem) + } + path := s.(pather).Path(sp) + var processes []Process + err = filepath.Walk(path, func(p string, info os.FileInfo, err error) error { + if err != nil { + return err + } + if !recursive && info.IsDir() { + if p == path { + return nil + } + return filepath.SkipDir + } + dir, name := filepath.Split(p) + if name != pType { + return nil + } + procs, err := readPids(dir, subsystem, pType) + if err != nil { + return err + } + processes = append(processes, procs...) + return nil + }) + return processes, err +} + +// Freeze freezes the entire cgroup and all the processes inside it +func (c *cgroup) Freeze() error { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + s := c.getSubsystem(Freezer) + if s == nil { + return ErrFreezerNotSupported + } + sp, err := c.path(Freezer) + if err != nil { + return err + } + return s.(*freezerController).Freeze(sp) +} + +// Thaw thaws out the cgroup and all the processes inside it +func (c *cgroup) Thaw() error { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + s := c.getSubsystem(Freezer) + if s == nil { + return ErrFreezerNotSupported + } + sp, err := c.path(Freezer) + if err != nil { + return err + } + return s.(*freezerController).Thaw(sp) +} + +// OOMEventFD returns the memory cgroup's out of memory event fd that triggers +// when processes inside the cgroup receive an oom event. Returns +// ErrMemoryNotSupported if memory cgroups is not supported. +func (c *cgroup) OOMEventFD() (uintptr, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return 0, c.err + } + s := c.getSubsystem(Memory) + if s == nil { + return 0, ErrMemoryNotSupported + } + sp, err := c.path(Memory) + if err != nil { + return 0, err + } + return s.(*memoryController).memoryEvent(sp, OOMEvent()) +} + +// RegisterMemoryEvent allows the ability to register for all v1 memory cgroups +// notifications. +func (c *cgroup) RegisterMemoryEvent(event MemoryEvent) (uintptr, error) { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return 0, c.err + } + s := c.getSubsystem(Memory) + if s == nil { + return 0, ErrMemoryNotSupported + } + sp, err := c.path(Memory) + if err != nil { + return 0, err + } + return s.(*memoryController).memoryEvent(sp, event) +} + +// State returns the state of the cgroup and its processes +func (c *cgroup) State() State { + c.mu.Lock() + defer c.mu.Unlock() + c.checkExists() + if c.err != nil && c.err == ErrCgroupDeleted { + return Deleted + } + s := c.getSubsystem(Freezer) + if s == nil { + return Thawed + } + sp, err := c.path(Freezer) + if err != nil { + return Unknown + } + state, err := s.(*freezerController).state(sp) + if err != nil { + return Unknown + } + return state +} + +// MoveTo does a recursive move subsystem by subsystem of all the processes +// inside the group +func (c *cgroup) MoveTo(destination Cgroup) error { + c.mu.Lock() + defer c.mu.Unlock() + if c.err != nil { + return c.err + } + for _, s := range c.subsystems { + processes, err := c.processes(s.Name(), true, cgroupProcs) + if err != nil { + return err + } + for _, p := range processes { + if err := destination.Add(p); err != nil { + if strings.Contains(err.Error(), "no such process") { + continue + } + return err + } + } + } + return nil +} + +func (c *cgroup) getSubsystem(n Name) Subsystem { + for _, s := range c.subsystems { + if s.Name() == n { + return s + } + } + return nil +} + +func (c *cgroup) checkExists() { + for _, s := range pathers(c.subsystems) { + p, err := c.path(s.Name()) + if err != nil { + return + } + if _, err := os.Lstat(s.Path(p)); err != nil { + if os.IsNotExist(err) { + c.err = ErrCgroupDeleted + return + } + } + } +} diff --git a/vendor/github.com/containerd/cgroups/control.go b/vendor/github.com/containerd/cgroups/control.go new file mode 100644 index 0000000000000..5fcaac6d0566c --- /dev/null +++ b/vendor/github.com/containerd/cgroups/control.go @@ -0,0 +1,99 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "os" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +type procType = string + +const ( + cgroupProcs procType = "cgroup.procs" + cgroupTasks procType = "tasks" + defaultDirPerm = 0755 +) + +// defaultFilePerm is a var so that the test framework can change the filemode +// of all files created when the tests are running. The difference between the +// tests and real world use is that files like "cgroup.procs" will exist when writing +// to a read cgroup filesystem and do not exist prior when running in the tests. +// this is set to a non 0 value in the test code +var defaultFilePerm = os.FileMode(0) + +type Process struct { + // Subsystem is the name of the subsystem that the process / task is in. + Subsystem Name + // Pid is the process id of the process / task. + Pid int + // Path is the full path of the subsystem and location that the process / task is in. + Path string +} + +type Task = Process + +// Cgroup handles interactions with the individual groups to perform +// actions on them as them main interface to this cgroup package +type Cgroup interface { + // New creates a new cgroup under the calling cgroup + New(string, *specs.LinuxResources) (Cgroup, error) + // Add adds a process to the cgroup (cgroup.procs). Without additional arguments, + // the process is added to all the cgroup subsystems. When giving Add a list of + // subsystem names, the process is only added to those subsystems, provided that + // they are active in the targeted cgroup. + Add(Process, ...Name) error + // AddProc adds the process with the given id to the cgroup (cgroup.procs). + // Without additional arguments, the process with the given id is added to all + // the cgroup subsystems. When giving AddProc a list of subsystem names, the process + // id is only added to those subsystems, provided that they are active in the targeted + // cgroup. + AddProc(uint64, ...Name) error + // AddTask adds a process to the cgroup (tasks). Without additional arguments, the + // task is added to all the cgroup subsystems. When giving AddTask a list of subsystem + // names, the task is only added to those subsystems, provided that they are active in + // the targeted cgroup. + AddTask(Process, ...Name) error + // Delete removes the cgroup as a whole + Delete() error + // MoveTo moves all the processes under the calling cgroup to the provided one + // subsystems are moved one at a time + MoveTo(Cgroup) error + // Stat returns the stats for all subsystems in the cgroup + Stat(...ErrorHandler) (*v1.Metrics, error) + // Update updates all the subsystems with the provided resource changes + Update(resources *specs.LinuxResources) error + // Processes returns all the processes in a select subsystem for the cgroup + Processes(Name, bool) ([]Process, error) + // Tasks returns all the tasks in a select subsystem for the cgroup + Tasks(Name, bool) ([]Task, error) + // Freeze freezes or pauses all processes inside the cgroup + Freeze() error + // Thaw thaw or resumes all processes inside the cgroup + Thaw() error + // OOMEventFD returns the memory subsystem's event fd for OOM events + OOMEventFD() (uintptr, error) + // RegisterMemoryEvent returns the memory subsystems event fd for whatever memory event was + // registered for. Can alternatively register for the oom event with this method. + RegisterMemoryEvent(MemoryEvent) (uintptr, error) + // State returns the cgroups current state + State() State + // Subsystems returns all the subsystems in the cgroup + Subsystems() []Subsystem +} diff --git a/vendor/github.com/containerd/cgroups/cpu.go b/vendor/github.com/containerd/cgroups/cpu.go new file mode 100644 index 0000000000000..27024f17b826c --- /dev/null +++ b/vendor/github.com/containerd/cgroups/cpu.go @@ -0,0 +1,125 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewCpu(root string) *cpuController { + return &cpuController{ + root: filepath.Join(root, string(Cpu)), + } +} + +type cpuController struct { + root string +} + +func (c *cpuController) Name() Name { + return Cpu +} + +func (c *cpuController) Path(path string) string { + return filepath.Join(c.root, path) +} + +func (c *cpuController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(c.Path(path), defaultDirPerm); err != nil { + return err + } + if cpu := resources.CPU; cpu != nil { + for _, t := range []struct { + name string + ivalue *int64 + uvalue *uint64 + }{ + { + name: "rt_period_us", + uvalue: cpu.RealtimePeriod, + }, + { + name: "rt_runtime_us", + ivalue: cpu.RealtimeRuntime, + }, + { + name: "shares", + uvalue: cpu.Shares, + }, + { + name: "cfs_period_us", + uvalue: cpu.Period, + }, + { + name: "cfs_quota_us", + ivalue: cpu.Quota, + }, + } { + var value []byte + if t.uvalue != nil { + value = []byte(strconv.FormatUint(*t.uvalue, 10)) + } else if t.ivalue != nil { + value = []byte(strconv.FormatInt(*t.ivalue, 10)) + } + if value != nil { + if err := retryingWriteFile( + filepath.Join(c.Path(path), "cpu."+t.name), + value, + defaultFilePerm, + ); err != nil { + return err + } + } + } + } + return nil +} + +func (c *cpuController) Update(path string, resources *specs.LinuxResources) error { + return c.Create(path, resources) +} + +func (c *cpuController) Stat(path string, stats *v1.Metrics) error { + f, err := os.Open(filepath.Join(c.Path(path), "cpu.stat")) + if err != nil { + return err + } + defer f.Close() + // get or create the cpu field because cpuacct can also set values on this struct + sc := bufio.NewScanner(f) + for sc.Scan() { + key, v, err := parseKV(sc.Text()) + if err != nil { + return err + } + switch key { + case "nr_periods": + stats.CPU.Throttling.Periods = v + case "nr_throttled": + stats.CPU.Throttling.ThrottledPeriods = v + case "throttled_time": + stats.CPU.Throttling.ThrottledTime = v + } + } + return sc.Err() +} diff --git a/vendor/github.com/containerd/cgroups/cpuacct.go b/vendor/github.com/containerd/cgroups/cpuacct.go new file mode 100644 index 0000000000000..1022fa379b597 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/cpuacct.go @@ -0,0 +1,129 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" +) + +const nanosecondsInSecond = 1000000000 + +var clockTicks = getClockTicks() + +func NewCpuacct(root string) *cpuacctController { + return &cpuacctController{ + root: filepath.Join(root, string(Cpuacct)), + } +} + +type cpuacctController struct { + root string +} + +func (c *cpuacctController) Name() Name { + return Cpuacct +} + +func (c *cpuacctController) Path(path string) string { + return filepath.Join(c.root, path) +} + +func (c *cpuacctController) Stat(path string, stats *v1.Metrics) error { + user, kernel, err := c.getUsage(path) + if err != nil { + return err + } + total, err := readUint(filepath.Join(c.Path(path), "cpuacct.usage")) + if err != nil { + return err + } + percpu, err := c.percpuUsage(path) + if err != nil { + return err + } + stats.CPU.Usage.Total = total + stats.CPU.Usage.User = user + stats.CPU.Usage.Kernel = kernel + stats.CPU.Usage.PerCPU = percpu + return nil +} + +func (c *cpuacctController) percpuUsage(path string) ([]uint64, error) { + var usage []uint64 + data, err := os.ReadFile(filepath.Join(c.Path(path), "cpuacct.usage_percpu")) + if err != nil { + return nil, err + } + for _, v := range strings.Fields(string(data)) { + u, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, err + } + usage = append(usage, u) + } + return usage, nil +} + +func (c *cpuacctController) getUsage(path string) (user uint64, kernel uint64, err error) { + statPath := filepath.Join(c.Path(path), "cpuacct.stat") + f, err := os.Open(statPath) + if err != nil { + return 0, 0, err + } + defer f.Close() + var ( + raw = make(map[string]uint64) + sc = bufio.NewScanner(f) + ) + for sc.Scan() { + key, v, err := parseKV(sc.Text()) + if err != nil { + return 0, 0, err + } + raw[key] = v + } + if err := sc.Err(); err != nil { + return 0, 0, err + } + for _, t := range []struct { + name string + value *uint64 + }{ + { + name: "user", + value: &user, + }, + { + name: "system", + value: &kernel, + }, + } { + v, ok := raw[t.name] + if !ok { + return 0, 0, fmt.Errorf("expected field %q but not found in %q", t.name, statPath) + } + *t.value = v + } + return (user * nanosecondsInSecond) / clockTicks, (kernel * nanosecondsInSecond) / clockTicks, nil +} diff --git a/vendor/github.com/containerd/cgroups/cpuset.go b/vendor/github.com/containerd/cgroups/cpuset.go new file mode 100644 index 0000000000000..8b56d3dbaa001 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/cpuset.go @@ -0,0 +1,158 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewCpuset(root string) *cpusetController { + return &cpusetController{ + root: filepath.Join(root, string(Cpuset)), + } +} + +type cpusetController struct { + root string +} + +func (c *cpusetController) Name() Name { + return Cpuset +} + +func (c *cpusetController) Path(path string) string { + return filepath.Join(c.root, path) +} + +func (c *cpusetController) Create(path string, resources *specs.LinuxResources) error { + if err := c.ensureParent(c.Path(path), c.root); err != nil { + return err + } + if err := os.MkdirAll(c.Path(path), defaultDirPerm); err != nil { + return err + } + if err := c.copyIfNeeded(c.Path(path), filepath.Dir(c.Path(path))); err != nil { + return err + } + if resources.CPU != nil { + for _, t := range []struct { + name string + value string + }{ + { + name: "cpus", + value: resources.CPU.Cpus, + }, + { + name: "mems", + value: resources.CPU.Mems, + }, + } { + if t.value != "" { + if err := retryingWriteFile( + filepath.Join(c.Path(path), "cpuset."+t.name), + []byte(t.value), + defaultFilePerm, + ); err != nil { + return err + } + } + } + } + return nil +} + +func (c *cpusetController) Update(path string, resources *specs.LinuxResources) error { + return c.Create(path, resources) +} + +func (c *cpusetController) getValues(path string) (cpus []byte, mems []byte, err error) { + if cpus, err = os.ReadFile(filepath.Join(path, "cpuset.cpus")); err != nil && !os.IsNotExist(err) { + return + } + if mems, err = os.ReadFile(filepath.Join(path, "cpuset.mems")); err != nil && !os.IsNotExist(err) { + return + } + return cpus, mems, nil +} + +// ensureParent makes sure that the parent directory of current is created +// and populated with the proper cpus and mems files copied from +// it's parent. +func (c *cpusetController) ensureParent(current, root string) error { + parent := filepath.Dir(current) + if _, err := filepath.Rel(root, parent); err != nil { + return nil + } + // Avoid infinite recursion. + if parent == current { + return fmt.Errorf("cpuset: cgroup parent path outside cgroup root") + } + if cleanPath(parent) != root { + if err := c.ensureParent(parent, root); err != nil { + return err + } + } + if err := os.MkdirAll(current, defaultDirPerm); err != nil { + return err + } + return c.copyIfNeeded(current, parent) +} + +// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func (c *cpusetController) copyIfNeeded(current, parent string) error { + var ( + err error + currentCpus, currentMems []byte + parentCpus, parentMems []byte + ) + if currentCpus, currentMems, err = c.getValues(current); err != nil { + return err + } + if parentCpus, parentMems, err = c.getValues(parent); err != nil { + return err + } + if isEmpty(currentCpus) { + if err := retryingWriteFile( + filepath.Join(current, "cpuset.cpus"), + parentCpus, + defaultFilePerm, + ); err != nil { + return err + } + } + if isEmpty(currentMems) { + if err := retryingWriteFile( + filepath.Join(current, "cpuset.mems"), + parentMems, + defaultFilePerm, + ); err != nil { + return err + } + } + return nil +} + +func isEmpty(b []byte) bool { + return len(bytes.Trim(b, "\n")) == 0 +} diff --git a/vendor/github.com/containerd/cgroups/devices.go b/vendor/github.com/containerd/cgroups/devices.go new file mode 100644 index 0000000000000..7792566d5ee40 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/devices.go @@ -0,0 +1,92 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "fmt" + "os" + "path/filepath" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + allowDeviceFile = "devices.allow" + denyDeviceFile = "devices.deny" + wildcard = -1 +) + +func NewDevices(root string) *devicesController { + return &devicesController{ + root: filepath.Join(root, string(Devices)), + } +} + +type devicesController struct { + root string +} + +func (d *devicesController) Name() Name { + return Devices +} + +func (d *devicesController) Path(path string) string { + return filepath.Join(d.root, path) +} + +func (d *devicesController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(d.Path(path), defaultDirPerm); err != nil { + return err + } + for _, device := range resources.Devices { + file := denyDeviceFile + if device.Allow { + file = allowDeviceFile + } + if device.Type == "" { + device.Type = "a" + } + if err := retryingWriteFile( + filepath.Join(d.Path(path), file), + []byte(deviceString(device)), + defaultFilePerm, + ); err != nil { + return err + } + } + return nil +} + +func (d *devicesController) Update(path string, resources *specs.LinuxResources) error { + return d.Create(path, resources) +} + +func deviceString(device specs.LinuxDeviceCgroup) string { + return fmt.Sprintf("%s %s:%s %s", + device.Type, + deviceNumber(device.Major), + deviceNumber(device.Minor), + device.Access, + ) +} + +func deviceNumber(number *int64) string { + if number == nil || *number == wildcard { + return "*" + } + return fmt.Sprint(*number) +} diff --git a/vendor/github.com/containerd/cgroups/errors.go b/vendor/github.com/containerd/cgroups/errors.go new file mode 100644 index 0000000000000..f1ad8315c81e0 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/errors.go @@ -0,0 +1,47 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "errors" + "os" +) + +var ( + ErrInvalidPid = errors.New("cgroups: pid must be greater than 0") + ErrMountPointNotExist = errors.New("cgroups: cgroup mountpoint does not exist") + ErrInvalidFormat = errors.New("cgroups: parsing file with invalid format failed") + ErrFreezerNotSupported = errors.New("cgroups: freezer cgroup not supported on this system") + ErrMemoryNotSupported = errors.New("cgroups: memory cgroup not supported on this system") + ErrCgroupDeleted = errors.New("cgroups: cgroup deleted") + ErrNoCgroupMountDestination = errors.New("cgroups: cannot find cgroup mount destination") +) + +// ErrorHandler is a function that handles and acts on errors +type ErrorHandler func(err error) error + +// IgnoreNotExist ignores any errors that are for not existing files +func IgnoreNotExist(err error) error { + if os.IsNotExist(err) { + return nil + } + return err +} + +func errPassthrough(err error) error { + return err +} diff --git a/vendor/github.com/containerd/cgroups/freezer.go b/vendor/github.com/containerd/cgroups/freezer.go new file mode 100644 index 0000000000000..5783f0dcc741a --- /dev/null +++ b/vendor/github.com/containerd/cgroups/freezer.go @@ -0,0 +1,82 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "os" + "path/filepath" + "strings" + "time" +) + +func NewFreezer(root string) *freezerController { + return &freezerController{ + root: filepath.Join(root, string(Freezer)), + } +} + +type freezerController struct { + root string +} + +func (f *freezerController) Name() Name { + return Freezer +} + +func (f *freezerController) Path(path string) string { + return filepath.Join(f.root, path) +} + +func (f *freezerController) Freeze(path string) error { + return f.waitState(path, Frozen) +} + +func (f *freezerController) Thaw(path string) error { + return f.waitState(path, Thawed) +} + +func (f *freezerController) changeState(path string, state State) error { + return retryingWriteFile( + filepath.Join(f.root, path, "freezer.state"), + []byte(strings.ToUpper(string(state))), + defaultFilePerm, + ) +} + +func (f *freezerController) state(path string) (State, error) { + current, err := os.ReadFile(filepath.Join(f.root, path, "freezer.state")) + if err != nil { + return "", err + } + return State(strings.ToLower(strings.TrimSpace(string(current)))), nil +} + +func (f *freezerController) waitState(path string, state State) error { + for { + if err := f.changeState(path, state); err != nil { + return err + } + current, err := f.state(path) + if err != nil { + return err + } + if current == state { + return nil + } + time.Sleep(1 * time.Millisecond) + } +} diff --git a/vendor/github.com/containerd/cgroups/hierarchy.go b/vendor/github.com/containerd/cgroups/hierarchy.go new file mode 100644 index 0000000000000..ca3f1b9380b75 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/hierarchy.go @@ -0,0 +1,20 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +// Hierarchy enables both unified and split hierarchy for cgroups +type Hierarchy func() ([]Subsystem, error) diff --git a/vendor/github.com/containerd/cgroups/hugetlb.go b/vendor/github.com/containerd/cgroups/hugetlb.go new file mode 100644 index 0000000000000..c0eb03b24da3a --- /dev/null +++ b/vendor/github.com/containerd/cgroups/hugetlb.go @@ -0,0 +1,109 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewHugetlb(root string) (*hugetlbController, error) { + sizes, err := hugePageSizes() + if err != nil { + return nil, err + } + + return &hugetlbController{ + root: filepath.Join(root, string(Hugetlb)), + sizes: sizes, + }, nil +} + +type hugetlbController struct { + root string + sizes []string +} + +func (h *hugetlbController) Name() Name { + return Hugetlb +} + +func (h *hugetlbController) Path(path string) string { + return filepath.Join(h.root, path) +} + +func (h *hugetlbController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(h.Path(path), defaultDirPerm); err != nil { + return err + } + for _, limit := range resources.HugepageLimits { + if err := retryingWriteFile( + filepath.Join(h.Path(path), strings.Join([]string{"hugetlb", limit.Pagesize, "limit_in_bytes"}, ".")), + []byte(strconv.FormatUint(limit.Limit, 10)), + defaultFilePerm, + ); err != nil { + return err + } + } + return nil +} + +func (h *hugetlbController) Stat(path string, stats *v1.Metrics) error { + for _, size := range h.sizes { + s, err := h.readSizeStat(path, size) + if err != nil { + return err + } + stats.Hugetlb = append(stats.Hugetlb, s) + } + return nil +} + +func (h *hugetlbController) readSizeStat(path, size string) (*v1.HugetlbStat, error) { + s := v1.HugetlbStat{ + Pagesize: size, + } + for _, t := range []struct { + name string + value *uint64 + }{ + { + name: "usage_in_bytes", + value: &s.Usage, + }, + { + name: "max_usage_in_bytes", + value: &s.Max, + }, + { + name: "failcnt", + value: &s.Failcnt, + }, + } { + v, err := readUint(filepath.Join(h.Path(path), strings.Join([]string{"hugetlb", size, t.name}, "."))) + if err != nil { + return nil, err + } + *t.value = v + } + return &s, nil +} diff --git a/vendor/github.com/containerd/cgroups/memory.go b/vendor/github.com/containerd/cgroups/memory.go new file mode 100644 index 0000000000000..e271866ef94fc --- /dev/null +++ b/vendor/github.com/containerd/cgroups/memory.go @@ -0,0 +1,480 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "bufio" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +// MemoryEvent is an interface that V1 memory Cgroup notifications implement. Arg returns the +// file name whose fd should be written to "cgroups.event_control". EventFile returns the name of +// the file that supports the notification api e.g. "memory.usage_in_bytes". +type MemoryEvent interface { + Arg() string + EventFile() string +} + +type memoryThresholdEvent struct { + threshold uint64 + swap bool +} + +// MemoryThresholdEvent returns a new memory threshold event to be used with RegisterMemoryEvent. +// If swap is true, the event will be registered using memory.memsw.usage_in_bytes +func MemoryThresholdEvent(threshold uint64, swap bool) MemoryEvent { + return &memoryThresholdEvent{ + threshold, + swap, + } +} + +func (m *memoryThresholdEvent) Arg() string { + return strconv.FormatUint(m.threshold, 10) +} + +func (m *memoryThresholdEvent) EventFile() string { + if m.swap { + return "memory.memsw.usage_in_bytes" + } + return "memory.usage_in_bytes" +} + +type oomEvent struct{} + +// OOMEvent returns a new oom event to be used with RegisterMemoryEvent. +func OOMEvent() MemoryEvent { + return &oomEvent{} +} + +func (oom *oomEvent) Arg() string { + return "" +} + +func (oom *oomEvent) EventFile() string { + return "memory.oom_control" +} + +type memoryPressureEvent struct { + pressureLevel MemoryPressureLevel + hierarchy EventNotificationMode +} + +// MemoryPressureEvent returns a new memory pressure event to be used with RegisterMemoryEvent. +func MemoryPressureEvent(pressureLevel MemoryPressureLevel, hierarchy EventNotificationMode) MemoryEvent { + return &memoryPressureEvent{ + pressureLevel, + hierarchy, + } +} + +func (m *memoryPressureEvent) Arg() string { + return string(m.pressureLevel) + "," + string(m.hierarchy) +} + +func (m *memoryPressureEvent) EventFile() string { + return "memory.pressure_level" +} + +// MemoryPressureLevel corresponds to the memory pressure levels defined +// for memory cgroups. +type MemoryPressureLevel string + +// The three memory pressure levels are as follows. +// - The "low" level means that the system is reclaiming memory for new +// allocations. Monitoring this reclaiming activity might be useful for +// maintaining cache level. Upon notification, the program (typically +// "Activity Manager") might analyze vmstat and act in advance (i.e. +// prematurely shutdown unimportant services). +// - The "medium" level means that the system is experiencing medium memory +// pressure, the system might be making swap, paging out active file caches, +// etc. Upon this event applications may decide to further analyze +// vmstat/zoneinfo/memcg or internal memory usage statistics and free any +// resources that can be easily reconstructed or re-read from a disk. +// - The "critical" level means that the system is actively thrashing, it is +// about to out of memory (OOM) or even the in-kernel OOM killer is on its +// way to trigger. Applications should do whatever they can to help the +// system. It might be too late to consult with vmstat or any other +// statistics, so it is advisable to take an immediate action. +// "https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt" Section 11 +const ( + LowPressure MemoryPressureLevel = "low" + MediumPressure MemoryPressureLevel = "medium" + CriticalPressure MemoryPressureLevel = "critical" +) + +// EventNotificationMode corresponds to the notification modes +// for the memory cgroups pressure level notifications. +type EventNotificationMode string + +// There are three optional modes that specify different propagation behavior: +// - "default": this is the default behavior specified above. This mode is the +// same as omitting the optional mode parameter, preserved by backwards +// compatibility. +// - "hierarchy": events always propagate up to the root, similar to the default +// behavior, except that propagation continues regardless of whether there are +// event listeners at each level, with the "hierarchy" mode. In the above +// example, groups A, B, and C will receive notification of memory pressure. +// - "local": events are pass-through, i.e. they only receive notifications when +// memory pressure is experienced in the memcg for which the notification is +// registered. In the above example, group C will receive notification if +// registered for "local" notification and the group experiences memory +// pressure. However, group B will never receive notification, regardless if +// there is an event listener for group C or not, if group B is registered for +// local notification. +// "https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt" Section 11 +const ( + DefaultMode EventNotificationMode = "default" + LocalMode EventNotificationMode = "local" + HierarchyMode EventNotificationMode = "hierarchy" +) + +// NewMemory returns a Memory controller given the root folder of cgroups. +// It may optionally accept other configuration options, such as IgnoreModules(...) +func NewMemory(root string, options ...func(*memoryController)) *memoryController { + mc := &memoryController{ + root: filepath.Join(root, string(Memory)), + ignored: map[string]struct{}{}, + } + for _, opt := range options { + opt(mc) + } + return mc +} + +// IgnoreModules configure the memory controller to not read memory metrics for some +// module names (e.g. passing "memsw" would avoid all the memory.memsw.* entries) +func IgnoreModules(names ...string) func(*memoryController) { + return func(mc *memoryController) { + for _, name := range names { + mc.ignored[name] = struct{}{} + } + } +} + +// OptionalSwap allows the memory controller to not fail if cgroups is not accounting +// Swap memory (there are no memory.memsw.* entries) +func OptionalSwap() func(*memoryController) { + return func(mc *memoryController) { + _, err := os.Stat(filepath.Join(mc.root, "memory.memsw.usage_in_bytes")) + if os.IsNotExist(err) { + mc.ignored["memsw"] = struct{}{} + } + } +} + +type memoryController struct { + root string + ignored map[string]struct{} +} + +func (m *memoryController) Name() Name { + return Memory +} + +func (m *memoryController) Path(path string) string { + return filepath.Join(m.root, path) +} + +func (m *memoryController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(m.Path(path), defaultDirPerm); err != nil { + return err + } + if resources.Memory == nil { + return nil + } + return m.set(path, getMemorySettings(resources)) +} + +func (m *memoryController) Update(path string, resources *specs.LinuxResources) error { + if resources.Memory == nil { + return nil + } + g := func(v *int64) bool { + return v != nil && *v > 0 + } + settings := getMemorySettings(resources) + if g(resources.Memory.Limit) && g(resources.Memory.Swap) { + // if the updated swap value is larger than the current memory limit set the swap changes first + // then set the memory limit as swap must always be larger than the current limit + current, err := readUint(filepath.Join(m.Path(path), "memory.limit_in_bytes")) + if err != nil { + return err + } + if current < uint64(*resources.Memory.Swap) { + settings[0], settings[1] = settings[1], settings[0] + } + } + return m.set(path, settings) +} + +func (m *memoryController) Stat(path string, stats *v1.Metrics) error { + fMemStat, err := os.Open(filepath.Join(m.Path(path), "memory.stat")) + if err != nil { + return err + } + defer fMemStat.Close() + stats.Memory = &v1.MemoryStat{ + Usage: &v1.MemoryEntry{}, + Swap: &v1.MemoryEntry{}, + Kernel: &v1.MemoryEntry{}, + KernelTCP: &v1.MemoryEntry{}, + } + if err := m.parseStats(fMemStat, stats.Memory); err != nil { + return err + } + + fMemOomControl, err := os.Open(filepath.Join(m.Path(path), "memory.oom_control")) + if err != nil { + return err + } + defer fMemOomControl.Close() + stats.MemoryOomControl = &v1.MemoryOomControl{} + if err := m.parseOomControlStats(fMemOomControl, stats.MemoryOomControl); err != nil { + return err + } + for _, t := range []struct { + module string + entry *v1.MemoryEntry + }{ + { + module: "", + entry: stats.Memory.Usage, + }, + { + module: "memsw", + entry: stats.Memory.Swap, + }, + { + module: "kmem", + entry: stats.Memory.Kernel, + }, + { + module: "kmem.tcp", + entry: stats.Memory.KernelTCP, + }, + } { + if _, ok := m.ignored[t.module]; ok { + continue + } + for _, tt := range []struct { + name string + value *uint64 + }{ + { + name: "usage_in_bytes", + value: &t.entry.Usage, + }, + { + name: "max_usage_in_bytes", + value: &t.entry.Max, + }, + { + name: "failcnt", + value: &t.entry.Failcnt, + }, + { + name: "limit_in_bytes", + value: &t.entry.Limit, + }, + } { + parts := []string{"memory"} + if t.module != "" { + parts = append(parts, t.module) + } + parts = append(parts, tt.name) + v, err := readUint(filepath.Join(m.Path(path), strings.Join(parts, "."))) + if err != nil { + return err + } + *tt.value = v + } + } + return nil +} + +func (m *memoryController) parseStats(r io.Reader, stat *v1.MemoryStat) error { + var ( + raw = make(map[string]uint64) + sc = bufio.NewScanner(r) + line int + ) + for sc.Scan() { + key, v, err := parseKV(sc.Text()) + if err != nil { + return fmt.Errorf("%d: %v", line, err) + } + raw[key] = v + line++ + } + if err := sc.Err(); err != nil { + return err + } + stat.Cache = raw["cache"] + stat.RSS = raw["rss"] + stat.RSSHuge = raw["rss_huge"] + stat.MappedFile = raw["mapped_file"] + stat.Dirty = raw["dirty"] + stat.Writeback = raw["writeback"] + stat.PgPgIn = raw["pgpgin"] + stat.PgPgOut = raw["pgpgout"] + stat.PgFault = raw["pgfault"] + stat.PgMajFault = raw["pgmajfault"] + stat.InactiveAnon = raw["inactive_anon"] + stat.ActiveAnon = raw["active_anon"] + stat.InactiveFile = raw["inactive_file"] + stat.ActiveFile = raw["active_file"] + stat.Unevictable = raw["unevictable"] + stat.HierarchicalMemoryLimit = raw["hierarchical_memory_limit"] + stat.HierarchicalSwapLimit = raw["hierarchical_memsw_limit"] + stat.TotalCache = raw["total_cache"] + stat.TotalRSS = raw["total_rss"] + stat.TotalRSSHuge = raw["total_rss_huge"] + stat.TotalMappedFile = raw["total_mapped_file"] + stat.TotalDirty = raw["total_dirty"] + stat.TotalWriteback = raw["total_writeback"] + stat.TotalPgPgIn = raw["total_pgpgin"] + stat.TotalPgPgOut = raw["total_pgpgout"] + stat.TotalPgFault = raw["total_pgfault"] + stat.TotalPgMajFault = raw["total_pgmajfault"] + stat.TotalInactiveAnon = raw["total_inactive_anon"] + stat.TotalActiveAnon = raw["total_active_anon"] + stat.TotalInactiveFile = raw["total_inactive_file"] + stat.TotalActiveFile = raw["total_active_file"] + stat.TotalUnevictable = raw["total_unevictable"] + return nil +} + +func (m *memoryController) parseOomControlStats(r io.Reader, stat *v1.MemoryOomControl) error { + var ( + raw = make(map[string]uint64) + sc = bufio.NewScanner(r) + line int + ) + for sc.Scan() { + key, v, err := parseKV(sc.Text()) + if err != nil { + return fmt.Errorf("%d: %v", line, err) + } + raw[key] = v + line++ + } + if err := sc.Err(); err != nil { + return err + } + stat.OomKillDisable = raw["oom_kill_disable"] + stat.UnderOom = raw["under_oom"] + stat.OomKill = raw["oom_kill"] + return nil +} + +func (m *memoryController) set(path string, settings []memorySettings) error { + for _, t := range settings { + if t.value != nil { + if err := retryingWriteFile( + filepath.Join(m.Path(path), "memory."+t.name), + []byte(strconv.FormatInt(*t.value, 10)), + defaultFilePerm, + ); err != nil { + return err + } + } + } + return nil +} + +type memorySettings struct { + name string + value *int64 +} + +func getMemorySettings(resources *specs.LinuxResources) []memorySettings { + mem := resources.Memory + var swappiness *int64 + if mem.Swappiness != nil { + v := int64(*mem.Swappiness) + swappiness = &v + } + return []memorySettings{ + { + name: "limit_in_bytes", + value: mem.Limit, + }, + { + name: "soft_limit_in_bytes", + value: mem.Reservation, + }, + { + name: "memsw.limit_in_bytes", + value: mem.Swap, + }, + { + name: "kmem.limit_in_bytes", + value: mem.Kernel, + }, + { + name: "kmem.tcp.limit_in_bytes", + value: mem.KernelTCP, + }, + { + name: "oom_control", + value: getOomControlValue(mem), + }, + { + name: "swappiness", + value: swappiness, + }, + } +} + +func getOomControlValue(mem *specs.LinuxMemory) *int64 { + if mem.DisableOOMKiller != nil && *mem.DisableOOMKiller { + i := int64(1) + return &i + } + return nil +} + +func (m *memoryController) memoryEvent(path string, event MemoryEvent) (uintptr, error) { + root := m.Path(path) + efd, err := unix.Eventfd(0, unix.EFD_CLOEXEC) + if err != nil { + return 0, err + } + evtFile, err := os.Open(filepath.Join(root, event.EventFile())) + if err != nil { + unix.Close(efd) + return 0, err + } + defer evtFile.Close() + data := fmt.Sprintf("%d %d %s", efd, evtFile.Fd(), event.Arg()) + evctlPath := filepath.Join(root, "cgroup.event_control") + if err := retryingWriteFile(evctlPath, []byte(data), 0700); err != nil { + unix.Close(efd) + return 0, err + } + return uintptr(efd), nil +} diff --git a/vendor/github.com/containerd/cgroups/named.go b/vendor/github.com/containerd/cgroups/named.go new file mode 100644 index 0000000000000..06b16c3b1573b --- /dev/null +++ b/vendor/github.com/containerd/cgroups/named.go @@ -0,0 +1,39 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import "path/filepath" + +func NewNamed(root string, name Name) *namedController { + return &namedController{ + root: root, + name: name, + } +} + +type namedController struct { + root string + name Name +} + +func (n *namedController) Name() Name { + return n.name +} + +func (n *namedController) Path(path string) string { + return filepath.Join(n.root, string(n.name), path) +} diff --git a/vendor/github.com/containerd/cgroups/net_cls.go b/vendor/github.com/containerd/cgroups/net_cls.go new file mode 100644 index 0000000000000..839b06de080b1 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/net_cls.go @@ -0,0 +1,61 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "os" + "path/filepath" + "strconv" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewNetCls(root string) *netclsController { + return &netclsController{ + root: filepath.Join(root, string(NetCLS)), + } +} + +type netclsController struct { + root string +} + +func (n *netclsController) Name() Name { + return NetCLS +} + +func (n *netclsController) Path(path string) string { + return filepath.Join(n.root, path) +} + +func (n *netclsController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(n.Path(path), defaultDirPerm); err != nil { + return err + } + if resources.Network != nil && resources.Network.ClassID != nil && *resources.Network.ClassID > 0 { + return retryingWriteFile( + filepath.Join(n.Path(path), "net_cls.classid"), + []byte(strconv.FormatUint(uint64(*resources.Network.ClassID), 10)), + defaultFilePerm, + ) + } + return nil +} + +func (n *netclsController) Update(path string, resources *specs.LinuxResources) error { + return n.Create(path, resources) +} diff --git a/vendor/github.com/containerd/cgroups/net_prio.go b/vendor/github.com/containerd/cgroups/net_prio.go new file mode 100644 index 0000000000000..6362fd084f716 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/net_prio.go @@ -0,0 +1,65 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "fmt" + "os" + "path/filepath" + + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewNetPrio(root string) *netprioController { + return &netprioController{ + root: filepath.Join(root, string(NetPrio)), + } +} + +type netprioController struct { + root string +} + +func (n *netprioController) Name() Name { + return NetPrio +} + +func (n *netprioController) Path(path string) string { + return filepath.Join(n.root, path) +} + +func (n *netprioController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(n.Path(path), defaultDirPerm); err != nil { + return err + } + if resources.Network != nil { + for _, prio := range resources.Network.Priorities { + if err := retryingWriteFile( + filepath.Join(n.Path(path), "net_prio.ifpriomap"), + formatPrio(prio.Name, prio.Priority), + defaultFilePerm, + ); err != nil { + return err + } + } + } + return nil +} + +func formatPrio(name string, prio uint32) []byte { + return []byte(fmt.Sprintf("%s %d", name, prio)) +} diff --git a/vendor/github.com/containerd/cgroups/opts.go b/vendor/github.com/containerd/cgroups/opts.go new file mode 100644 index 0000000000000..1e428d0480794 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/opts.go @@ -0,0 +1,61 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "errors" +) + +var ( + // ErrIgnoreSubsystem allows the specific subsystem to be skipped + ErrIgnoreSubsystem = errors.New("skip subsystem") + // ErrDevicesRequired is returned when the devices subsystem is required but + // does not exist or is not active + ErrDevicesRequired = errors.New("devices subsystem is required") +) + +// InitOpts allows configuration for the creation or loading of a cgroup +type InitOpts func(*InitConfig) error + +// InitConfig provides configuration options for the creation +// or loading of a cgroup and its subsystems +type InitConfig struct { + // InitCheck can be used to check initialization errors from the subsystem + InitCheck InitCheck +} + +func newInitConfig() *InitConfig { + return &InitConfig{ + InitCheck: RequireDevices, + } +} + +// InitCheck allows subsystems errors to be checked when initialized or loaded +type InitCheck func(Subsystem, Path, error) error + +// AllowAny allows any subsystem errors to be skipped +func AllowAny(_ Subsystem, _ Path, _ error) error { + return ErrIgnoreSubsystem +} + +// RequireDevices requires the device subsystem but no others +func RequireDevices(s Subsystem, _ Path, _ error) error { + if s.Name() == Devices { + return ErrDevicesRequired + } + return ErrIgnoreSubsystem +} diff --git a/vendor/github.com/containerd/cgroups/paths.go b/vendor/github.com/containerd/cgroups/paths.go new file mode 100644 index 0000000000000..bddc4e9cdcd7e --- /dev/null +++ b/vendor/github.com/containerd/cgroups/paths.go @@ -0,0 +1,106 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "errors" + "fmt" + "path/filepath" +) + +type Path func(subsystem Name) (string, error) + +func RootPath(subsystem Name) (string, error) { + return "/", nil +} + +// StaticPath returns a static path to use for all cgroups +func StaticPath(path string) Path { + return func(_ Name) (string, error) { + return path, nil + } +} + +// NestedPath will nest the cgroups based on the calling processes cgroup +// placing its child processes inside its own path +func NestedPath(suffix string) Path { + paths, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return errorPath(err) + } + return existingPath(paths, suffix) +} + +// PidPath will return the correct cgroup paths for an existing process running inside a cgroup +// This is commonly used for the Load function to restore an existing container +func PidPath(pid int) Path { + p := fmt.Sprintf("/proc/%d/cgroup", pid) + paths, err := ParseCgroupFile(p) + if err != nil { + return errorPath(fmt.Errorf("parse cgroup file %s: %w", p, err)) + } + return existingPath(paths, "") +} + +// ErrControllerNotActive is returned when a controller is not supported or enabled +var ErrControllerNotActive = errors.New("controller is not supported") + +func existingPath(paths map[string]string, suffix string) Path { + // localize the paths based on the root mount dest for nested cgroups + for n, p := range paths { + dest, err := getCgroupDestination(n) + if err != nil { + return errorPath(err) + } + rel, err := filepath.Rel(dest, p) + if err != nil { + return errorPath(err) + } + if rel == "." { + rel = dest + } + paths[n] = filepath.Join("/", rel) + } + return func(name Name) (string, error) { + root, ok := paths[string(name)] + if !ok { + if root, ok = paths["name="+string(name)]; !ok { + return "", ErrControllerNotActive + } + } + if suffix != "" { + return filepath.Join(root, suffix), nil + } + return root, nil + } +} + +func subPath(path Path, subName string) Path { + return func(name Name) (string, error) { + p, err := path(name) + if err != nil { + return "", err + } + return filepath.Join(p, subName), nil + } +} + +func errorPath(err error) Path { + return func(_ Name) (string, error) { + return "", err + } +} diff --git a/vendor/github.com/containerd/cgroups/perf_event.go b/vendor/github.com/containerd/cgroups/perf_event.go new file mode 100644 index 0000000000000..648786db68f6f --- /dev/null +++ b/vendor/github.com/containerd/cgroups/perf_event.go @@ -0,0 +1,37 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import "path/filepath" + +func NewPerfEvent(root string) *PerfEventController { + return &PerfEventController{ + root: filepath.Join(root, string(PerfEvent)), + } +} + +type PerfEventController struct { + root string +} + +func (p *PerfEventController) Name() Name { + return PerfEvent +} + +func (p *PerfEventController) Path(path string) string { + return filepath.Join(p.root, path) +} diff --git a/vendor/github.com/containerd/cgroups/pids.go b/vendor/github.com/containerd/cgroups/pids.go new file mode 100644 index 0000000000000..66a1b6b44708e --- /dev/null +++ b/vendor/github.com/containerd/cgroups/pids.go @@ -0,0 +1,85 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +func NewPids(root string) *pidsController { + return &pidsController{ + root: filepath.Join(root, string(Pids)), + } +} + +type pidsController struct { + root string +} + +func (p *pidsController) Name() Name { + return Pids +} + +func (p *pidsController) Path(path string) string { + return filepath.Join(p.root, path) +} + +func (p *pidsController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(p.Path(path), defaultDirPerm); err != nil { + return err + } + if resources.Pids != nil && resources.Pids.Limit > 0 { + return retryingWriteFile( + filepath.Join(p.Path(path), "pids.max"), + []byte(strconv.FormatInt(resources.Pids.Limit, 10)), + defaultFilePerm, + ) + } + return nil +} + +func (p *pidsController) Update(path string, resources *specs.LinuxResources) error { + return p.Create(path, resources) +} + +func (p *pidsController) Stat(path string, stats *v1.Metrics) error { + current, err := readUint(filepath.Join(p.Path(path), "pids.current")) + if err != nil { + return err + } + var max uint64 + maxData, err := os.ReadFile(filepath.Join(p.Path(path), "pids.max")) + if err != nil { + return err + } + if maxS := strings.TrimSpace(string(maxData)); maxS != "max" { + if max, err = parseUint(maxS, 10, 64); err != nil { + return err + } + } + stats.Pids = &v1.PidsStat{ + Current: current, + Limit: max, + } + return nil +} diff --git a/vendor/github.com/containerd/cgroups/rdma.go b/vendor/github.com/containerd/cgroups/rdma.go new file mode 100644 index 0000000000000..9d414203e42ed --- /dev/null +++ b/vendor/github.com/containerd/cgroups/rdma.go @@ -0,0 +1,154 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "math" + "os" + "path/filepath" + "strconv" + "strings" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +type rdmaController struct { + root string +} + +func (p *rdmaController) Name() Name { + return Rdma +} + +func (p *rdmaController) Path(path string) string { + return filepath.Join(p.root, path) +} + +func NewRdma(root string) *rdmaController { + return &rdmaController{ + root: filepath.Join(root, string(Rdma)), + } +} + +func createCmdString(device string, limits *specs.LinuxRdma) string { + var cmdString string + + cmdString = device + if limits.HcaHandles != nil { + cmdString = cmdString + " " + "hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) + } + + if limits.HcaObjects != nil { + cmdString = cmdString + " " + "hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) + } + return cmdString +} + +func (p *rdmaController) Create(path string, resources *specs.LinuxResources) error { + if err := os.MkdirAll(p.Path(path), defaultDirPerm); err != nil { + return err + } + + for device, limit := range resources.Rdma { + if device != "" && (limit.HcaHandles != nil || limit.HcaObjects != nil) { + limit := limit + return retryingWriteFile( + filepath.Join(p.Path(path), "rdma.max"), + []byte(createCmdString(device, &limit)), + defaultFilePerm, + ) + } + } + return nil +} + +func (p *rdmaController) Update(path string, resources *specs.LinuxResources) error { + return p.Create(path, resources) +} + +func parseRdmaKV(raw string, entry *v1.RdmaEntry) { + var value uint64 + var err error + + parts := strings.Split(raw, "=") + switch len(parts) { + case 2: + if parts[1] == "max" { + value = math.MaxUint32 + } else { + value, err = parseUint(parts[1], 10, 32) + if err != nil { + return + } + } + if parts[0] == "hca_handle" { + entry.HcaHandles = uint32(value) + } else if parts[0] == "hca_object" { + entry.HcaObjects = uint32(value) + } + } +} + +func toRdmaEntry(strEntries []string) []*v1.RdmaEntry { + var rdmaEntries []*v1.RdmaEntry + for i := range strEntries { + parts := strings.Fields(strEntries[i]) + switch len(parts) { + case 3: + entry := new(v1.RdmaEntry) + entry.Device = parts[0] + parseRdmaKV(parts[1], entry) + parseRdmaKV(parts[2], entry) + + rdmaEntries = append(rdmaEntries, entry) + default: + continue + } + } + return rdmaEntries +} + +func (p *rdmaController) Stat(path string, stats *v1.Metrics) error { + + currentData, err := os.ReadFile(filepath.Join(p.Path(path), "rdma.current")) + if err != nil { + return err + } + currentPerDevices := strings.Split(string(currentData), "\n") + + maxData, err := os.ReadFile(filepath.Join(p.Path(path), "rdma.max")) + if err != nil { + return err + } + maxPerDevices := strings.Split(string(maxData), "\n") + + // If device got removed between reading two files, ignore returning + // stats. + if len(currentPerDevices) != len(maxPerDevices) { + return nil + } + + currentEntries := toRdmaEntry(currentPerDevices) + maxEntries := toRdmaEntry(maxPerDevices) + + stats.Rdma = &v1.RdmaStat{ + Current: currentEntries, + Limit: maxEntries, + } + return nil +} diff --git a/vendor/github.com/containerd/cgroups/state.go b/vendor/github.com/containerd/cgroups/state.go new file mode 100644 index 0000000000000..cfeabbbc60b3a --- /dev/null +++ b/vendor/github.com/containerd/cgroups/state.go @@ -0,0 +1,28 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +// State is a type that represents the state of the current cgroup +type State string + +const ( + Unknown State = "" + Thawed State = "thawed" + Frozen State = "frozen" + Freezing State = "freezing" + Deleted State = "deleted" +) diff --git a/vendor/github.com/containerd/cgroups/subsystem.go b/vendor/github.com/containerd/cgroups/subsystem.go new file mode 100644 index 0000000000000..b2f41854d2c51 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/subsystem.go @@ -0,0 +1,116 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "fmt" + "os" + + v1 "github.com/containerd/cgroups/stats/v1" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +// Name is a typed name for a cgroup subsystem +type Name string + +const ( + Devices Name = "devices" + Hugetlb Name = "hugetlb" + Freezer Name = "freezer" + Pids Name = "pids" + NetCLS Name = "net_cls" + NetPrio Name = "net_prio" + PerfEvent Name = "perf_event" + Cpuset Name = "cpuset" + Cpu Name = "cpu" + Cpuacct Name = "cpuacct" + Memory Name = "memory" + Blkio Name = "blkio" + Rdma Name = "rdma" +) + +// Subsystems returns a complete list of the default cgroups +// available on most linux systems +func Subsystems() []Name { + n := []Name{ + Freezer, + Pids, + NetCLS, + NetPrio, + PerfEvent, + Cpuset, + Cpu, + Cpuacct, + Memory, + Blkio, + Rdma, + } + if !RunningInUserNS() { + n = append(n, Devices) + } + if _, err := os.Stat("/sys/kernel/mm/hugepages"); err == nil { + n = append(n, Hugetlb) + } + return n +} + +type Subsystem interface { + Name() Name +} + +type pather interface { + Subsystem + Path(path string) string +} + +type creator interface { + Subsystem + Create(path string, resources *specs.LinuxResources) error +} + +type deleter interface { + Subsystem + Delete(path string) error +} + +type stater interface { + Subsystem + Stat(path string, stats *v1.Metrics) error +} + +type updater interface { + Subsystem + Update(path string, resources *specs.LinuxResources) error +} + +// SingleSubsystem returns a single cgroup subsystem within the base Hierarchy +func SingleSubsystem(baseHierarchy Hierarchy, subsystem Name) Hierarchy { + return func() ([]Subsystem, error) { + subsystems, err := baseHierarchy() + if err != nil { + return nil, err + } + for _, s := range subsystems { + if s.Name() == subsystem { + return []Subsystem{ + s, + }, nil + } + } + return nil, fmt.Errorf("unable to find subsystem %s", subsystem) + } +} diff --git a/vendor/github.com/containerd/cgroups/systemd.go b/vendor/github.com/containerd/cgroups/systemd.go new file mode 100644 index 0000000000000..4da57cb4b00b5 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/systemd.go @@ -0,0 +1,158 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "context" + "path/filepath" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +const ( + SystemdDbus Name = "systemd" + defaultSlice = "system.slice" +) + +var ( + canDelegate bool + once sync.Once +) + +func Systemd() ([]Subsystem, error) { + root, err := v1MountPoint() + if err != nil { + return nil, err + } + defaultSubsystems, err := defaults(root) + if err != nil { + return nil, err + } + s, err := NewSystemd(root) + if err != nil { + return nil, err + } + // make sure the systemd controller is added first + return append([]Subsystem{s}, defaultSubsystems...), nil +} + +func Slice(slice, name string) Path { + if slice == "" { + slice = defaultSlice + } + return func(subsystem Name) (string, error) { + return filepath.Join(slice, name), nil + } +} + +func NewSystemd(root string) (*SystemdController, error) { + return &SystemdController{ + root: root, + }, nil +} + +type SystemdController struct { + mu sync.Mutex + root string +} + +func (s *SystemdController) Name() Name { + return SystemdDbus +} + +func (s *SystemdController) Create(path string, _ *specs.LinuxResources) error { + ctx := context.TODO() + conn, err := systemdDbus.NewWithContext(ctx) + if err != nil { + return err + } + defer conn.Close() + slice, name := splitName(path) + // We need to see if systemd can handle the delegate property + // Systemd will return an error if it cannot handle delegate regardless + // of its bool setting. + checkDelegate := func() { + canDelegate = true + dlSlice := newProperty("Delegate", true) + if _, err := conn.StartTransientUnitContext(ctx, slice, "testdelegate", []systemdDbus.Property{dlSlice}, nil); err != nil { + if dbusError, ok := err.(dbus.Error); ok { + // Starting with systemd v237, Delegate is not even a property of slices anymore, + // so the D-Bus call fails with "InvalidArgs" error. + if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") || strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.InvalidArgs") { + canDelegate = false + } + } + } + + _, _ = conn.StopUnitContext(ctx, slice, "testDelegate", nil) + } + once.Do(checkDelegate) + properties := []systemdDbus.Property{ + systemdDbus.PropDescription("cgroup " + name), + systemdDbus.PropWants(slice), + newProperty("DefaultDependencies", false), + newProperty("MemoryAccounting", true), + newProperty("CPUAccounting", true), + newProperty("BlockIOAccounting", true), + } + + // If we can delegate, we add the property back in + if canDelegate { + properties = append(properties, newProperty("Delegate", true)) + } + + ch := make(chan string) + _, err = conn.StartTransientUnitContext(ctx, name, "replace", properties, ch) + if err != nil { + return err + } + <-ch + return nil +} + +func (s *SystemdController) Delete(path string) error { + ctx := context.TODO() + conn, err := systemdDbus.NewWithContext(ctx) + if err != nil { + return err + } + defer conn.Close() + _, name := splitName(path) + ch := make(chan string) + _, err = conn.StopUnitContext(ctx, name, "replace", ch) + if err != nil { + return err + } + <-ch + return nil +} + +func newProperty(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func splitName(path string) (slice string, unit string) { + slice, unit = filepath.Split(path) + return strings.TrimSuffix(slice, "/"), unit +} diff --git a/vendor/github.com/containerd/cgroups/ticks.go b/vendor/github.com/containerd/cgroups/ticks.go new file mode 100644 index 0000000000000..84dc38d0cc331 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/ticks.go @@ -0,0 +1,26 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +func getClockTicks() uint64 { + // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and + // on Linux it's a constant which is safe to be hard coded, + // so we can avoid using cgo here. + // See https://github.com/containerd/cgroups/pull/12 for + // more details. + return 100 +} diff --git a/vendor/github.com/containerd/cgroups/utils.go b/vendor/github.com/containerd/cgroups/utils.go new file mode 100644 index 0000000000000..c17a3a41423a5 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/utils.go @@ -0,0 +1,391 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + "time" + + units "github.com/docker/go-units" + specs "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +var ( + nsOnce sync.Once + inUserNS bool + checkMode sync.Once + cgMode CGMode +) + +const unifiedMountpoint = "/sys/fs/cgroup" + +// CGMode is the cgroups mode of the host system +type CGMode int + +const ( + // Unavailable cgroup mountpoint + Unavailable CGMode = iota + // Legacy cgroups v1 + Legacy + // Hybrid with cgroups v1 and v2 controllers mounted + Hybrid + // Unified with only cgroups v2 mounted + Unified +) + +// Mode returns the cgroups mode running on the host +func Mode() CGMode { + checkMode.Do(func() { + var st unix.Statfs_t + if err := unix.Statfs(unifiedMountpoint, &st); err != nil { + cgMode = Unavailable + return + } + switch st.Type { + case unix.CGROUP2_SUPER_MAGIC: + cgMode = Unified + default: + cgMode = Legacy + if err := unix.Statfs(filepath.Join(unifiedMountpoint, "unified"), &st); err != nil { + return + } + if st.Type == unix.CGROUP2_SUPER_MAGIC { + cgMode = Hybrid + } + } + }) + return cgMode +} + +// RunningInUserNS detects whether we are currently running in a user namespace. +// Copied from github.com/lxc/lxd/shared/util.go +func RunningInUserNS() bool { + nsOnce.Do(func() { + file, err := os.Open("/proc/self/uid_map") + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return + } + defer file.Close() + + buf := bufio.NewReader(file) + l, _, err := buf.ReadLine() + if err != nil { + return + } + + line := string(l) + var a, b, c int64 + fmt.Sscanf(line, "%d %d %d", &a, &b, &c) + + /* + * We assume we are in the initial user namespace if we have a full + * range - 4294967295 uids starting at uid 0. + */ + if a == 0 && b == 0 && c == 4294967295 { + return + } + inUserNS = true + }) + return inUserNS +} + +// defaults returns all known groups +func defaults(root string) ([]Subsystem, error) { + h, err := NewHugetlb(root) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + s := []Subsystem{ + NewNamed(root, "systemd"), + NewFreezer(root), + NewPids(root), + NewNetCls(root), + NewNetPrio(root), + NewPerfEvent(root), + NewCpuset(root), + NewCpu(root), + NewCpuacct(root), + NewMemory(root), + NewBlkio(root), + NewRdma(root), + } + // only add the devices cgroup if we are not in a user namespace + // because modifications are not allowed + if !RunningInUserNS() { + s = append(s, NewDevices(root)) + } + // add the hugetlb cgroup if error wasn't due to missing hugetlb + // cgroup support on the host + if err == nil { + s = append(s, h) + } + return s, nil +} + +// remove will remove a cgroup path handling EAGAIN and EBUSY errors and +// retrying the remove after a exp timeout +func remove(path string) error { + delay := 10 * time.Millisecond + for i := 0; i < 5; i++ { + if i != 0 { + time.Sleep(delay) + delay *= 2 + } + if err := os.RemoveAll(path); err == nil { + return nil + } + } + return fmt.Errorf("cgroups: unable to remove path %q", path) +} + +// readPids will read all the pids of processes or tasks in a cgroup by the provided path +func readPids(path string, subsystem Name, pType procType) ([]Process, error) { + f, err := os.Open(filepath.Join(path, pType)) + if err != nil { + return nil, err + } + defer f.Close() + var ( + out []Process + s = bufio.NewScanner(f) + ) + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, Process{ + Pid: pid, + Subsystem: subsystem, + Path: path, + }) + } + } + if err := s.Err(); err != nil { + // failed to read all pids? + return nil, err + } + return out, nil +} + +func hugePageSizes() ([]string, error) { + var ( + pageSizes []string + sizeList = []string{"B", "KB", "MB", "GB", "TB", "PB"} + ) + files, err := os.ReadDir("/sys/kernel/mm/hugepages") + if err != nil { + return nil, err + } + for _, st := range files { + nameArray := strings.Split(st.Name(), "-") + pageSize, err := units.RAMInBytes(nameArray[1]) + if err != nil { + return nil, err + } + pageSizes = append(pageSizes, units.CustomSize("%g%s", float64(pageSize), 1024.0, sizeList)) + } + return pageSizes, nil +} + +func readUint(path string) (uint64, error) { + v, err := os.ReadFile(path) + if err != nil { + return 0, err + } + return parseUint(strings.TrimSpace(string(v)), 10, 64) +} + +func parseUint(s string, base, bitSize int) (uint64, error) { + v, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && + intErr.(*strconv.NumError).Err == strconv.ErrRange && + intValue < 0 { + return 0, nil + } + return 0, err + } + return v, nil +} + +func parseKV(raw string) (string, uint64, error) { + parts := strings.Fields(raw) + switch len(parts) { + case 2: + v, err := parseUint(parts[1], 10, 64) + if err != nil { + return "", 0, err + } + return parts[0], v, nil + default: + return "", 0, ErrInvalidFormat + } +} + +// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup +// or /proc//cgroup, into a map of subsystems to cgroup paths, e.g. +// "cpu": "/user.slice/user-1000.slice" +// "pids": "/user.slice/user-1000.slice" +// etc. +// +// The resulting map does not have an element for cgroup v2 unified hierarchy. +// Use ParseCgroupFileUnified to get the unified path. +func ParseCgroupFile(path string) (map[string]string, error) { + x, _, err := ParseCgroupFileUnified(path) + return x, err +} + +// ParseCgroupFileUnified returns legacy subsystem paths as the first value, +// and returns the unified path as the second value. +func ParseCgroupFileUnified(path string) (map[string]string, string, error) { + f, err := os.Open(path) + if err != nil { + return nil, "", err + } + defer f.Close() + return parseCgroupFromReaderUnified(f) +} + +func parseCgroupFromReaderUnified(r io.Reader) (map[string]string, string, error) { + var ( + cgroups = make(map[string]string) + unified = "" + s = bufio.NewScanner(r) + ) + for s.Scan() { + var ( + text = s.Text() + parts = strings.SplitN(text, ":", 3) + ) + if len(parts) < 3 { + return nil, unified, fmt.Errorf("invalid cgroup entry: %q", text) + } + for _, subs := range strings.Split(parts[1], ",") { + if subs == "" { + unified = parts[2] + } else { + cgroups[subs] = parts[2] + } + } + } + if err := s.Err(); err != nil { + return nil, unified, err + } + return cgroups, unified, nil +} + +func getCgroupDestination(subsystem string) (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + s := bufio.NewScanner(f) + for s.Scan() { + fields := strings.Split(s.Text(), " ") + if len(fields) < 10 { + // broken mountinfo? + continue + } + if fields[len(fields)-3] != "cgroup" { + continue + } + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if opt == subsystem { + return fields[3], nil + } + } + } + if err := s.Err(); err != nil { + return "", err + } + return "", ErrNoCgroupMountDestination +} + +func pathers(subystems []Subsystem) []pather { + var out []pather + for _, s := range subystems { + if p, ok := s.(pather); ok { + out = append(out, p) + } + } + return out +} + +func initializeSubsystem(s Subsystem, path Path, resources *specs.LinuxResources) error { + if c, ok := s.(creator); ok { + p, err := path(s.Name()) + if err != nil { + return err + } + if err := c.Create(p, resources); err != nil { + return err + } + } else if c, ok := s.(pather); ok { + p, err := path(s.Name()) + if err != nil { + return err + } + // do the default create if the group does not have a custom one + if err := os.MkdirAll(c.Path(p), defaultDirPerm); err != nil { + return err + } + } + return nil +} + +func cleanPath(path string) string { + if path == "" { + return "" + } + path = filepath.Clean(path) + if !filepath.IsAbs(path) { + path, _ = filepath.Rel(string(os.PathSeparator), filepath.Clean(string(os.PathSeparator)+path)) + } + return path +} + +func retryingWriteFile(path string, data []byte, mode os.FileMode) error { + // Retry writes on EINTR; see: + // https://github.com/golang/go/issues/38033 + for { + err := os.WriteFile(path, data, mode) + if err == nil { + return nil + } else if !errors.Is(err, syscall.EINTR) { + return err + } + } +} diff --git a/vendor/github.com/containerd/cgroups/v1.go b/vendor/github.com/containerd/cgroups/v1.go new file mode 100644 index 0000000000000..2ec215c06f424 --- /dev/null +++ b/vendor/github.com/containerd/cgroups/v1.go @@ -0,0 +1,73 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package cgroups + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "strings" +) + +// V1 returns all the groups in the default cgroups mountpoint in a single hierarchy +func V1() ([]Subsystem, error) { + root, err := v1MountPoint() + if err != nil { + return nil, err + } + subsystems, err := defaults(root) + if err != nil { + return nil, err + } + var enabled []Subsystem + for _, s := range pathers(subsystems) { + // check and remove the default groups that do not exist + if _, err := os.Lstat(s.Path("/")); err == nil { + enabled = append(enabled, s) + } + } + return enabled, nil +} + +// v1MountPoint returns the mount point where the cgroup +// mountpoints are mounted in a single hiearchy +func v1MountPoint() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + var ( + text = scanner.Text() + fields = strings.Split(text, " ") + numFields = len(fields) + ) + if numFields < 10 { + return "", fmt.Errorf("mountinfo: bad entry %q", text) + } + if fields[numFields-3] == "cgroup" { + return filepath.Dir(fields[4]), nil + } + } + if err := scanner.Err(); err != nil { + return "", err + } + return "", ErrMountPointNotExist +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 81623628e64a9..1eb94c4ddc18e 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -145,6 +145,7 @@ github.com/cilium/ebpf/link github.com/container-storage-interface/spec/lib/go/csi # github.com/containerd/cgroups v1.1.0 ## explicit; go 1.17 +github.com/containerd/cgroups github.com/containerd/cgroups/stats/v1 # github.com/containerd/console v1.0.3 ## explicit; go 1.13