diff --git a/pkg/kubelet/cm/cgroup_manager_linux.go b/pkg/kubelet/cm/cgroup_manager_linux.go index 1cd5f662429f0..7e96dbe25ff23 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux.go +++ b/pkg/kubelet/cm/cgroup_manager_linux.go @@ -17,7 +17,6 @@ limitations under the License. package cm import ( - "errors" "fmt" "os" "path" @@ -32,13 +31,11 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups/manager" cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd" libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs" - v1 "k8s.io/api/core/v1" "k8s.io/klog/v2" v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/apimachinery/pkg/util/sets" - cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util" "k8s.io/kubernetes/pkg/kubelet/metrics" ) @@ -139,11 +136,11 @@ type CgroupSubsystems struct { MountPoints map[string]string } -// cgroupManagerImpl implements the CgroupManager interface. -// Its a stateless object which can be used to -// update,create or delete any number of cgroups -// It relies on runc/libcontainer cgroup managers. -type cgroupManagerImpl struct { +// cgroupCommon implements common tasks +// that are valid for both cgroup v1 and v2. +// This prevents duplicating the code between +// v1 and v2 specific implementations. +type cgroupCommon struct { // subsystems holds information about all the // mounted cgroup subsystems on the node subsystems *CgroupSubsystems @@ -152,12 +149,20 @@ type cgroupManagerImpl struct { useSystemd bool } -// Make sure that cgroupManagerImpl implements the CgroupManager interface -var _ CgroupManager = &cgroupManagerImpl{} +// Make sure that cgroupV1impl and cgroupV2impl implement the CgroupManager interface +var _ CgroupManager = &cgroupV1impl{} +var _ CgroupManager = &cgroupV2impl{} // NewCgroupManager is a factory method that returns a CgroupManager func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager { - return &cgroupManagerImpl{ + if libcontainercgroups.IsCgroup2UnifiedMode() { + return NewCgroupV2Manager(cs, cgroupDriver) + } + return NewCgroupV1Manager(cs, cgroupDriver) +} + +func newCgroupCommon(cs *CgroupSubsystems, cgroupDriver string) cgroupCommon { + return cgroupCommon{ subsystems: cs, useSystemd: cgroupDriver == "systemd", } @@ -165,7 +170,7 @@ func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager { // Name converts the cgroup to the driver specific value in cgroupfs form. // This always returns a valid cgroupfs path even when systemd driver is in use! -func (m *cgroupManagerImpl) Name(name CgroupName) string { +func (m *cgroupCommon) Name(name CgroupName) string { if m.useSystemd { return name.ToSystemd() } @@ -173,7 +178,7 @@ func (m *cgroupManagerImpl) Name(name CgroupName) string { } // CgroupName converts the literal cgroupfs name on the host to an internal identifier. -func (m *cgroupManagerImpl) CgroupName(name string) CgroupName { +func (m *cgroupCommon) CgroupName(name string) CgroupName { if m.useSystemd { return ParseSystemdToCgroupName(name) } @@ -181,7 +186,7 @@ func (m *cgroupManagerImpl) CgroupName(name string) CgroupName { } // buildCgroupPaths builds a path to each cgroup subsystem for the specified name. -func (m *cgroupManagerImpl) buildCgroupPaths(name CgroupName) map[string]string { +func (m *cgroupCommon) buildCgroupPaths(name CgroupName) map[string]string { cgroupFsAdaptedName := m.Name(name) cgroupPaths := make(map[string]string, len(m.subsystems.MountPoints)) for key, val := range m.subsystems.MountPoints { @@ -190,14 +195,8 @@ func (m *cgroupManagerImpl) buildCgroupPaths(name CgroupName) map[string]string return cgroupPaths } -// buildCgroupUnifiedPath builds a path to the specified name. -func (m *cgroupManagerImpl) buildCgroupUnifiedPath(name CgroupName) string { - cgroupFsAdaptedName := m.Name(name) - return path.Join(cmutil.CgroupRoot, cgroupFsAdaptedName) -} - // libctCgroupConfig converts CgroupConfig to libcontainer's Cgroup config. -func (m *cgroupManagerImpl) libctCgroupConfig(in *CgroupConfig, needResources bool) *libcontainerconfigs.Cgroup { +func (m *cgroupCommon) libctCgroupConfig(in *CgroupConfig, needResources bool) *libcontainerconfigs.Cgroup { config := &libcontainerconfigs.Cgroup{ Systemd: m.useSystemd, } @@ -235,62 +234,8 @@ func (m *cgroupManagerImpl) libctCgroupConfig(in *CgroupConfig, needResources bo return config } -// Validate checks if all subsystem cgroups already exist -func (m *cgroupManagerImpl) Validate(name CgroupName) error { - if libcontainercgroups.IsCgroup2UnifiedMode() { - cgroupPath := m.buildCgroupUnifiedPath(name) - neededControllers := getSupportedUnifiedControllers() - enabledControllers, err := readUnifiedControllers(cgroupPath) - if err != nil { - return fmt.Errorf("could not read controllers for cgroup %q: %w", name, err) - } - difference := neededControllers.Difference(enabledControllers) - if difference.Len() > 0 { - return fmt.Errorf("cgroup %q has some missing controllers: %v", name, strings.Join(sets.List(difference), ", ")) - } - return nil // valid V2 cgroup - } - - // Get map of all cgroup paths on the system for the particular cgroup - cgroupPaths := m.buildCgroupPaths(name) - - // the presence of alternative control groups not known to runc confuses - // the kubelet existence checks. - // ideally, we would have a mechanism in runc to support Exists() logic - // scoped to the set control groups it understands. this is being discussed - // in https://github.com/opencontainers/runc/issues/1440 - // once resolved, we can remove this code. - allowlistControllers := sets.New[string]("cpu", "cpuacct", "cpuset", "memory", "systemd", "pids") - - if _, ok := m.subsystems.MountPoints["hugetlb"]; ok { - allowlistControllers.Insert("hugetlb") - } - var missingPaths []string - // If even one cgroup path doesn't exist, then the cgroup doesn't exist. - for controller, path := range cgroupPaths { - // ignore mounts we don't care about - if !allowlistControllers.Has(controller) { - continue - } - if !libcontainercgroups.PathExists(path) { - missingPaths = append(missingPaths, path) - } - } - - if len(missingPaths) > 0 { - return fmt.Errorf("cgroup %q has some missing paths: %v", name, strings.Join(missingPaths, ", ")) - } - - return nil // valid V1 cgroup -} - -// Exists checks if all subsystem cgroups already exist -func (m *cgroupManagerImpl) Exists(name CgroupName) bool { - return m.Validate(name) == nil -} - // Destroy destroys the specified cgroup -func (m *cgroupManagerImpl) Destroy(cgroupConfig *CgroupConfig) error { +func (m *cgroupCommon) Destroy(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { metrics.CgroupManagerDuration.WithLabelValues("destroy").Observe(metrics.SinceInSeconds(start)) @@ -321,38 +266,12 @@ func getCPUWeight(cpuShares *uint64) uint64 { return 1 + ((*cpuShares-2)*9999)/262142 } -// readUnifiedControllers reads the controllers available at the specified cgroup -func readUnifiedControllers(path string) (sets.Set[string], error) { - controllersFileContent, err := os.ReadFile(filepath.Join(path, "cgroup.controllers")) - if err != nil { - return nil, err - } - controllers := strings.Fields(string(controllersFileContent)) - return sets.New(controllers...), nil -} - var ( availableRootControllersOnce sync.Once availableRootControllers sets.Set[string] ) -// getSupportedUnifiedControllers returns a set of supported controllers when running on cgroup v2 -func getSupportedUnifiedControllers() sets.Set[string] { - // This is the set of controllers used by the Kubelet - supportedControllers := sets.New("cpu", "cpuset", "memory", "hugetlb", "pids") - // Memoize the set of controllers that are present in the root cgroup - availableRootControllersOnce.Do(func() { - var err error - availableRootControllers, err = readUnifiedControllers(cmutil.CgroupRoot) - if err != nil { - panic(fmt.Errorf("cannot read cgroup controllers at %s", cmutil.CgroupRoot)) - } - }) - // Return the set of controllers that are supported both by the Kubelet and by the kernel - return supportedControllers.Intersection(availableRootControllers) -} - -func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources { +func (m *cgroupCommon) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources { resources := &libcontainerconfigs.Resources{ SkipDevices: true, SkipFreezeOnSet: true, @@ -394,7 +313,7 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont return resources } -func (m *cgroupManagerImpl) maybeSetHugetlb(resourceConfig *ResourceConfig, resources *libcontainerconfigs.Resources) { +func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources *libcontainerconfigs.Resources) { // Check if hugetlb is supported. if libcontainercgroups.IsCgroup2UnifiedMode() { if !getSupportedUnifiedControllers().Has("hugetlb") { @@ -433,7 +352,7 @@ func (m *cgroupManagerImpl) maybeSetHugetlb(resourceConfig *ResourceConfig, reso } // Update updates the cgroup with the specified Cgroup Configuration -func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { +func (m *cgroupCommon) Update(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { metrics.CgroupManagerDuration.WithLabelValues("update").Observe(metrics.SinceInSeconds(start)) @@ -448,7 +367,7 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error { } // Create creates the specified cgroup -func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error { +func (m *cgroupCommon) Create(cgroupConfig *CgroupConfig) error { start := time.Now() defer func() { metrics.CgroupManagerDuration.WithLabelValues("create").Observe(metrics.SinceInSeconds(start)) @@ -480,7 +399,7 @@ func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error { } // Scans through all subsystems to find pids associated with specified cgroup. -func (m *cgroupManagerImpl) Pids(name CgroupName) []int { +func (m *cgroupCommon) Pids(name CgroupName) []int { // we need the driver specific name cgroupFsName := m.Name(name) @@ -530,7 +449,7 @@ func (m *cgroupManagerImpl) Pids(name CgroupName) []int { } // ReduceCPULimits reduces the cgroup's cpu shares to the lowest possible value -func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error { +func (m *cgroupCommon) ReduceCPULimits(cgroupName CgroupName) error { // Set lowest possible CpuShares value for the cgroup minimumCPUShares := uint64(MinShares) resources := &ResourceConfig{ @@ -543,100 +462,7 @@ func (m *cgroupManagerImpl) ReduceCPULimits(cgroupName CgroupName) error { return m.Update(containerConfig) } -// MemoryUsage returns the current memory usage of the specified cgroup, -// as read from cgroupfs. -func (m *cgroupManagerImpl) MemoryUsage(name CgroupName) (int64, error) { - var path, file string - if libcontainercgroups.IsCgroup2UnifiedMode() { - path = m.buildCgroupUnifiedPath(name) - file = "memory.current" - } else { - mp, ok := m.subsystems.MountPoints["memory"] - if !ok { // should not happen - return -1, errors.New("no cgroup v1 mountpoint for memory controller found") - } - path = mp + "/" + m.Name(name) - file = "memory.usage_in_bytes" - } - val, err := fscommon.GetCgroupParamUint(path, file) - return int64(val), err -} - -// Convert cgroup v1 cpu.shares value to cgroup v2 cpu.weight -// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2 -func CpuSharesToCpuWeight(cpuShares uint64) uint64 { - return uint64((((cpuShares - 2) * 9999) / 262142) + 1) -} - -// Convert cgroup v2 cpu.weight value to cgroup v1 cpu.shares -// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2 -func CpuWeightToCpuShares(cpuWeight uint64) uint64 { - return uint64((((cpuWeight - 1) * 262142) / 9999) + 2) -} - -func getCgroupv1CpuConfig(cgroupPath string) (*ResourceConfig, error) { - cpuQuotaStr, errQ := fscommon.GetCgroupParamString(cgroupPath, "cpu.cfs_quota_us") - if errQ != nil { - return nil, fmt.Errorf("failed to read CPU quota for cgroup %v: %v", cgroupPath, errQ) - } - cpuQuota, errInt := strconv.ParseInt(cpuQuotaStr, 10, 64) - if errInt != nil { - return nil, fmt.Errorf("failed to convert CPU quota as integer for cgroup %v: %v", cgroupPath, errInt) - } - cpuPeriod, errP := fscommon.GetCgroupParamUint(cgroupPath, "cpu.cfs_period_us") - if errP != nil { - return nil, fmt.Errorf("failed to read CPU period for cgroup %v: %v", cgroupPath, errP) - } - cpuShares, errS := fscommon.GetCgroupParamUint(cgroupPath, "cpu.shares") - if errS != nil { - return nil, fmt.Errorf("failed to read CPU shares for cgroup %v: %v", cgroupPath, errS) - } - return &ResourceConfig{CPUShares: &cpuShares, CPUQuota: &cpuQuota, CPUPeriod: &cpuPeriod}, nil -} - -func getCgroupv2CpuConfig(cgroupPath string) (*ResourceConfig, error) { - var cpuLimitStr, cpuPeriodStr string - cpuLimitAndPeriod, err := fscommon.GetCgroupParamString(cgroupPath, "cpu.max") - if err != nil { - return nil, fmt.Errorf("failed to read cpu.max file for cgroup %v: %v", cgroupPath, err) - } - numItems, errScan := fmt.Sscanf(cpuLimitAndPeriod, "%s %s", &cpuLimitStr, &cpuPeriodStr) - if errScan != nil || numItems != 2 { - return nil, fmt.Errorf("failed to correctly parse content of cpu.max file ('%s') for cgroup %v: %v", - cpuLimitAndPeriod, cgroupPath, errScan) - } - cpuLimit := int64(-1) - if cpuLimitStr != Cgroup2MaxCpuLimit { - cpuLimit, err = strconv.ParseInt(cpuLimitStr, 10, 64) - if err != nil { - return nil, fmt.Errorf("failed to convert CPU limit as integer for cgroup %v: %v", cgroupPath, err) - } - } - cpuPeriod, errPeriod := strconv.ParseUint(cpuPeriodStr, 10, 64) - if errPeriod != nil { - return nil, fmt.Errorf("failed to convert CPU period as integer for cgroup %v: %v", cgroupPath, errPeriod) - } - cpuWeight, errWeight := fscommon.GetCgroupParamUint(cgroupPath, "cpu.weight") - if errWeight != nil { - return nil, fmt.Errorf("failed to read CPU weight for cgroup %v: %v", cgroupPath, errWeight) - } - cpuShares := CpuWeightToCpuShares(cpuWeight) - return &ResourceConfig{CPUShares: &cpuShares, CPUQuota: &cpuLimit, CPUPeriod: &cpuPeriod}, nil -} - -func getCgroupCpuConfig(cgroupPath string) (*ResourceConfig, error) { - if libcontainercgroups.IsCgroup2UnifiedMode() { - return getCgroupv2CpuConfig(cgroupPath) - } else { - return getCgroupv1CpuConfig(cgroupPath) - } -} - -func getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) { - memLimitFile := "memory.limit_in_bytes" - if libcontainercgroups.IsCgroup2UnifiedMode() { - memLimitFile = "memory.max" - } +func readCgroupMemoryConfig(cgroupPath string, memLimitFile string) (*ResourceConfig, error) { memLimit, err := fscommon.GetCgroupParamUint(cgroupPath, memLimitFile) if err != nil { return nil, fmt.Errorf("failed to read %s for cgroup %v: %v", memLimitFile, cgroupPath, err) @@ -647,103 +473,11 @@ func getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) { } -// Get the resource config values applied to the cgroup for specified resource type -func (m *cgroupManagerImpl) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) { - cgroupPaths := m.buildCgroupPaths(name) - cgroupResourcePath, found := cgroupPaths[string(resource)] - if !found { - return nil, fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name) - } - switch resource { - case v1.ResourceCPU: - return getCgroupCpuConfig(cgroupResourcePath) - case v1.ResourceMemory: - return getCgroupMemoryConfig(cgroupResourcePath) - } - return nil, fmt.Errorf("unsupported resource %v for cgroup %v", resource, name) -} - -func setCgroupv1CpuConfig(cgroupPath string, resourceConfig *ResourceConfig) error { - var cpuQuotaStr, cpuPeriodStr, cpuSharesStr string - if resourceConfig.CPUQuota != nil { - cpuQuotaStr = strconv.FormatInt(*resourceConfig.CPUQuota, 10) - if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.cfs_quota_us"), []byte(cpuQuotaStr), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v: %v", cpuQuotaStr, cgroupPath, err) - } - } - if resourceConfig.CPUPeriod != nil { - cpuPeriodStr = strconv.FormatUint(*resourceConfig.CPUPeriod, 10) - if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.cfs_period_us"), []byte(cpuPeriodStr), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v: %v", cpuPeriodStr, cgroupPath, err) - } - } - if resourceConfig.CPUShares != nil { - cpuSharesStr = strconv.FormatUint(*resourceConfig.CPUShares, 10) - if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.shares"), []byte(cpuSharesStr), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v: %v", cpuSharesStr, cgroupPath, err) - } - } - return nil -} - -func setCgroupv2CpuConfig(cgroupPath string, resourceConfig *ResourceConfig) error { - if resourceConfig.CPUQuota != nil { - if resourceConfig.CPUPeriod == nil { - return fmt.Errorf("CpuPeriod must be specified in order to set CpuLimit") - } - cpuLimitStr := Cgroup2MaxCpuLimit - if *resourceConfig.CPUQuota > -1 { - cpuLimitStr = strconv.FormatInt(*resourceConfig.CPUQuota, 10) - } - cpuPeriodStr := strconv.FormatUint(*resourceConfig.CPUPeriod, 10) - cpuMaxStr := fmt.Sprintf("%s %s", cpuLimitStr, cpuPeriodStr) - if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.max"), []byte(cpuMaxStr), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v: %v", cpuMaxStr, cgroupPath, err) - } - } - if resourceConfig.CPUShares != nil { - cpuWeight := CpuSharesToCpuWeight(*resourceConfig.CPUShares) - cpuWeightStr := strconv.FormatUint(cpuWeight, 10) - if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.weight"), []byte(cpuWeightStr), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v: %v", cpuWeightStr, cgroupPath, err) - } - } - return nil -} - -func setCgroupCpuConfig(cgroupPath string, resourceConfig *ResourceConfig) error { - if libcontainercgroups.IsCgroup2UnifiedMode() { - return setCgroupv2CpuConfig(cgroupPath, resourceConfig) - } else { - return setCgroupv1CpuConfig(cgroupPath, resourceConfig) - } -} - -func setCgroupMemoryConfig(cgroupPath string, resourceConfig *ResourceConfig) error { - memLimitFile := "memory.limit_in_bytes" - if libcontainercgroups.IsCgroup2UnifiedMode() { - memLimitFile = "memory.max" - } +func writeCgroupMemoryLimit(memoryLimitFileLocation string, resourceConfig *ResourceConfig) error { memLimit := strconv.FormatInt(*resourceConfig.Memory, 10) - if err := os.WriteFile(filepath.Join(cgroupPath, memLimitFile), []byte(memLimit), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v/%v: %v", memLimit, cgroupPath, memLimitFile, err) + if err := os.WriteFile(memoryLimitFileLocation, []byte(memLimit), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %w", memLimit, memoryLimitFileLocation, err) } //TODO(vinaykul,InPlacePodVerticalScaling): Add memory request support return nil } - -// Set resource config for the specified resource type on the cgroup -func (m *cgroupManagerImpl) SetCgroupConfig(name CgroupName, resource v1.ResourceName, resourceConfig *ResourceConfig) error { - cgroupPaths := m.buildCgroupPaths(name) - cgroupResourcePath, found := cgroupPaths[string(resource)] - if !found { - return fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name) - } - switch resource { - case v1.ResourceCPU: - return setCgroupCpuConfig(cgroupResourcePath, resourceConfig) - case v1.ResourceMemory: - return setCgroupMemoryConfig(cgroupResourcePath, resourceConfig) - } - return nil -} diff --git a/pkg/kubelet/cm/cgroup_manager_linux_test.go b/pkg/kubelet/cm/cgroup_manager_linux_test.go index 0e28eb7cca87f..2974b5bc51f58 100644 --- a/pkg/kubelet/cm/cgroup_manager_linux_test.go +++ b/pkg/kubelet/cm/cgroup_manager_linux_test.go @@ -170,7 +170,7 @@ func TestParseSystemdToCgroupName(t *testing.T) { } } -func TestCpuSharesToCpuWeight(t *testing.T) { +func TestCpuSharesToCPUWeight(t *testing.T) { testCases := []struct { cpuShares uint64 expectedCpuWeight uint64 @@ -206,14 +206,14 @@ func TestCpuSharesToCpuWeight(t *testing.T) { } for _, testCase := range testCases { - if actual := CpuSharesToCpuWeight(testCase.cpuShares); actual != testCase.expectedCpuWeight { + if actual := cpuSharesToCPUWeight(testCase.cpuShares); actual != testCase.expectedCpuWeight { t.Errorf("cpuShares: %v, expectedCpuWeight: %v, actualCpuWeight: %v", testCase.cpuShares, testCase.expectedCpuWeight, actual) } } } -func TestCpuWeightToCpuShares(t *testing.T) { +func TestCpuWeightToCPUShares(t *testing.T) { testCases := []struct { cpuWeight uint64 expectedCpuShares uint64 @@ -245,7 +245,7 @@ func TestCpuWeightToCpuShares(t *testing.T) { } for _, testCase := range testCases { - if actual := CpuWeightToCpuShares(testCase.cpuWeight); actual != testCase.expectedCpuShares { + if actual := cpuWeightToCPUShares(testCase.cpuWeight); actual != testCase.expectedCpuShares { t.Errorf("cpuWeight: %v, expectedCpuShares: %v, actualCpuShares: %v", testCase.cpuWeight, testCase.expectedCpuShares, actual) } diff --git a/pkg/kubelet/cm/cgroup_v1_manager_linux.go b/pkg/kubelet/cm/cgroup_v1_manager_linux.go new file mode 100644 index 0000000000000..cffae2ff93ed5 --- /dev/null +++ b/pkg/kubelet/cm/cgroup_v1_manager_linux.go @@ -0,0 +1,186 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cm + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" +) + +const cgroupv1MemLimitFile string = "memory.limit_in_bytes" + +// cgroupV1impl implements the CgroupManager interface +// for cgroup v1. +// It's a stateless object which can be used to +// update, create or delete any number of cgroups +// It relies on runc/libcontainer cgroup managers. +type cgroupV1impl struct { + cgroupCommon +} + +func NewCgroupV1Manager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager { + return &cgroupV1impl{ + cgroupCommon: newCgroupCommon(cs, cgroupDriver), + } +} + +// Validate checks if all subsystem cgroups are valid +func (c *cgroupV1impl) Validate(name CgroupName) error { + // Get map of all cgroup paths on the system for the particular cgroup + cgroupPaths := c.buildCgroupPaths(name) + + // the presence of alternative control groups not known to runc confuses + // the kubelet existence checks. + // ideally, we would have a mechanism in runc to support Exists() logic + // scoped to the set control groups it understands. this is being discussed + // in https://github.com/opencontainers/runc/issues/1440 + // once resolved, we can remove this code. + allowlistControllers := sets.New[string]("cpu", "cpuacct", "cpuset", "memory", "systemd", "pids") + + if _, ok := c.subsystems.MountPoints["hugetlb"]; ok { + allowlistControllers.Insert("hugetlb") + } + var missingPaths []string + // If even one cgroup path doesn't exist, then the cgroup doesn't exist. + for controller, path := range cgroupPaths { + // ignore mounts we don't care about + if !allowlistControllers.Has(controller) { + continue + } + if !libcontainercgroups.PathExists(path) { + missingPaths = append(missingPaths, path) + } + } + + if len(missingPaths) > 0 { + return fmt.Errorf("cgroup %q has some missing paths: %v", name, strings.Join(missingPaths, ", ")) + } + + return nil +} + +// Exists checks if all subsystem cgroups already exist +func (c *cgroupV1impl) Exists(name CgroupName) bool { + return c.Validate(name) == nil +} + +// MemoryUsage returns the current memory usage of the specified cgroup, +// as read from cgroupfs. +func (c *cgroupV1impl) MemoryUsage(name CgroupName) (int64, error) { + var path, file string + mp, ok := c.subsystems.MountPoints["memory"] + if !ok { // should not happen + return -1, errors.New("no cgroup v1 mountpoint for memory controller found") + } + path = mp + "/" + c.Name(name) + file = "memory.usage_in_bytes" + val, err := fscommon.GetCgroupParamUint(path, file) + return int64(val), err +} + +// Get the resource config values applied to the cgroup for specified resource type +func (c *cgroupV1impl) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) { + cgroupPaths := c.buildCgroupPaths(name) + cgroupResourcePath, found := cgroupPaths[string(resource)] + if !found { + return nil, fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name) + } + switch resource { + case v1.ResourceCPU: + return c.getCgroupCPUConfig(cgroupResourcePath) + case v1.ResourceMemory: + return c.getCgroupMemoryConfig(cgroupResourcePath) + } + return nil, fmt.Errorf("unsupported resource %v for cgroup %v", resource, name) +} + +// Set resource config for the specified resource type on the cgroup +func (c *cgroupV1impl) SetCgroupConfig(name CgroupName, resource v1.ResourceName, resourceConfig *ResourceConfig) error { + cgroupPaths := c.buildCgroupPaths(name) + cgroupResourcePath, found := cgroupPaths[string(resource)] + if !found { + return fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name) + } + switch resource { + case v1.ResourceCPU: + return c.setCgroupCPUConfig(cgroupResourcePath, resourceConfig) + case v1.ResourceMemory: + return c.setCgroupMemoryConfig(cgroupResourcePath, resourceConfig) + } + return nil +} + +func (c *cgroupV1impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, error) { + cpuQuotaStr, errQ := fscommon.GetCgroupParamString(cgroupPath, "cpu.cfs_quota_us") + if errQ != nil { + return nil, fmt.Errorf("failed to read CPU quota for cgroup %v: %w", cgroupPath, errQ) + } + cpuQuota, errInt := strconv.ParseInt(cpuQuotaStr, 10, 64) + if errInt != nil { + return nil, fmt.Errorf("failed to convert CPU quota as integer for cgroup %v: %w", cgroupPath, errInt) + } + cpuPeriod, errP := fscommon.GetCgroupParamUint(cgroupPath, "cpu.cfs_period_us") + if errP != nil { + return nil, fmt.Errorf("failed to read CPU period for cgroup %v: %w", cgroupPath, errP) + } + cpuShares, errS := fscommon.GetCgroupParamUint(cgroupPath, "cpu.shares") + if errS != nil { + return nil, fmt.Errorf("failed to read CPU shares for cgroup %v: %w", cgroupPath, errS) + } + return &ResourceConfig{CPUShares: &cpuShares, CPUQuota: &cpuQuota, CPUPeriod: &cpuPeriod}, nil +} + +func (c *cgroupV1impl) setCgroupCPUConfig(cgroupPath string, resourceConfig *ResourceConfig) error { + var cpuQuotaStr, cpuPeriodStr, cpuSharesStr string + if resourceConfig.CPUQuota != nil { + cpuQuotaStr = strconv.FormatInt(*resourceConfig.CPUQuota, 10) + if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.cfs_quota_us"), []byte(cpuQuotaStr), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %w", cpuQuotaStr, cgroupPath, err) + } + } + if resourceConfig.CPUPeriod != nil { + cpuPeriodStr = strconv.FormatUint(*resourceConfig.CPUPeriod, 10) + if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.cfs_period_us"), []byte(cpuPeriodStr), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %w", cpuPeriodStr, cgroupPath, err) + } + } + if resourceConfig.CPUShares != nil { + cpuSharesStr = strconv.FormatUint(*resourceConfig.CPUShares, 10) + if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.shares"), []byte(cpuSharesStr), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %w", cpuSharesStr, cgroupPath, err) + } + } + return nil +} + +func (c *cgroupV1impl) setCgroupMemoryConfig(cgroupPath string, resourceConfig *ResourceConfig) error { + return writeCgroupMemoryLimit(filepath.Join(cgroupPath, cgroupv1MemLimitFile), resourceConfig) +} + +func (c *cgroupV1impl) getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) { + return readCgroupMemoryConfig(cgroupPath, cgroupv1MemLimitFile) +} diff --git a/pkg/kubelet/cm/cgroup_v2_manager_linux.go b/pkg/kubelet/cm/cgroup_v2_manager_linux.go new file mode 100644 index 0000000000000..42261b7a7f6ac --- /dev/null +++ b/pkg/kubelet/cm/cgroup_v2_manager_linux.go @@ -0,0 +1,217 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cm + +import ( + "fmt" + "os" + "path" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" + cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util" +) + +const cgroupv2MemLimitFile string = "memory.max" + +// cgroupV2impl implements the CgroupManager interface +// for cgroup v2. +// It's a stateless object which can be used to +// update, create or delete any number of cgroups +// It relies on runc/libcontainer cgroup managers. +type cgroupV2impl struct { + cgroupCommon +} + +func NewCgroupV2Manager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager { + return &cgroupV2impl{ + cgroupCommon: newCgroupCommon(cs, cgroupDriver), + } +} + +// Validate checks if all subsystem cgroups are valid +func (c *cgroupV2impl) Validate(name CgroupName) error { + cgroupPath := c.buildCgroupUnifiedPath(name) + neededControllers := getSupportedUnifiedControllers() + enabledControllers, err := readUnifiedControllers(cgroupPath) + if err != nil { + return fmt.Errorf("could not read controllers for cgroup %q: %w", name, err) + } + difference := neededControllers.Difference(enabledControllers) + if difference.Len() > 0 { + return fmt.Errorf("cgroup %q has some missing controllers: %v", name, strings.Join(sets.List(difference), ", ")) + } + return nil +} + +// Exists checks if all subsystem cgroups already exist +func (c *cgroupV2impl) Exists(name CgroupName) bool { + return c.Validate(name) == nil +} + +// MemoryUsage returns the current memory usage of the specified cgroup, +// as read from cgroupfs. +func (c *cgroupV2impl) MemoryUsage(name CgroupName) (int64, error) { + var path, file string + path = c.buildCgroupUnifiedPath(name) + file = "memory.current" + val, err := fscommon.GetCgroupParamUint(path, file) + return int64(val), err +} + +// Get the resource config values applied to the cgroup for specified resource type +func (c *cgroupV2impl) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) { + cgroupPaths := c.buildCgroupPaths(name) + cgroupResourcePath, found := cgroupPaths[string(resource)] + if !found { + return nil, fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name) + } + switch resource { + case v1.ResourceCPU: + return c.getCgroupCPUConfig(cgroupResourcePath) + case v1.ResourceMemory: + return c.getCgroupMemoryConfig(cgroupResourcePath) + } + return nil, fmt.Errorf("unsupported resource %v for cgroup %v", resource, name) +} + +// Set resource config for the specified resource type on the cgroup +func (c *cgroupV2impl) SetCgroupConfig(name CgroupName, resource v1.ResourceName, resourceConfig *ResourceConfig) error { + cgroupPaths := c.buildCgroupPaths(name) + cgroupResourcePath, found := cgroupPaths[string(resource)] + if !found { + return fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name) + } + switch resource { + case v1.ResourceCPU: + return c.setCgroupCPUConfig(cgroupResourcePath, resourceConfig) + case v1.ResourceMemory: + return c.setCgroupMemoryConfig(cgroupResourcePath, resourceConfig) + } + return nil +} + +func (c *cgroupV2impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, error) { + var cpuLimitStr, cpuPeriodStr string + cpuLimitAndPeriod, err := fscommon.GetCgroupParamString(cgroupPath, "cpu.max") + if err != nil { + return nil, fmt.Errorf("failed to read cpu.max file for cgroup %v: %w", cgroupPath, err) + } + numItems, errScan := fmt.Sscanf(cpuLimitAndPeriod, "%s %s", &cpuLimitStr, &cpuPeriodStr) + if errScan != nil || numItems != 2 { + return nil, fmt.Errorf("failed to correctly parse content of cpu.max file ('%s') for cgroup %v: %w", + cpuLimitAndPeriod, cgroupPath, errScan) + } + cpuLimit := int64(-1) + if cpuLimitStr != Cgroup2MaxCpuLimit { + cpuLimit, err = strconv.ParseInt(cpuLimitStr, 10, 64) + if err != nil { + return nil, fmt.Errorf("failed to convert CPU limit as integer for cgroup %v: %w", cgroupPath, err) + } + } + cpuPeriod, errPeriod := strconv.ParseUint(cpuPeriodStr, 10, 64) + if errPeriod != nil { + return nil, fmt.Errorf("failed to convert CPU period as integer for cgroup %v: %w", cgroupPath, errPeriod) + } + cpuWeight, errWeight := fscommon.GetCgroupParamUint(cgroupPath, "cpu.weight") + if errWeight != nil { + return nil, fmt.Errorf("failed to read CPU weight for cgroup %v: %w", cgroupPath, errWeight) + } + cpuShares := cpuWeightToCPUShares(cpuWeight) + return &ResourceConfig{CPUShares: &cpuShares, CPUQuota: &cpuLimit, CPUPeriod: &cpuPeriod}, nil +} + +func (c *cgroupV2impl) setCgroupCPUConfig(cgroupPath string, resourceConfig *ResourceConfig) error { + if resourceConfig.CPUQuota != nil { + if resourceConfig.CPUPeriod == nil { + return fmt.Errorf("CpuPeriod must be specified in order to set CpuLimit") + } + cpuLimitStr := Cgroup2MaxCpuLimit + if *resourceConfig.CPUQuota > -1 { + cpuLimitStr = strconv.FormatInt(*resourceConfig.CPUQuota, 10) + } + cpuPeriodStr := strconv.FormatUint(*resourceConfig.CPUPeriod, 10) + cpuMaxStr := fmt.Sprintf("%s %s", cpuLimitStr, cpuPeriodStr) + if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.max"), []byte(cpuMaxStr), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %w", cpuMaxStr, cgroupPath, err) + } + } + if resourceConfig.CPUShares != nil { + cpuWeight := cpuSharesToCPUWeight(*resourceConfig.CPUShares) + cpuWeightStr := strconv.FormatUint(cpuWeight, 10) + if err := os.WriteFile(filepath.Join(cgroupPath, "cpu.weight"), []byte(cpuWeightStr), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %w", cpuWeightStr, cgroupPath, err) + } + } + return nil +} + +func (c *cgroupV2impl) setCgroupMemoryConfig(cgroupPath string, resourceConfig *ResourceConfig) error { + return writeCgroupMemoryLimit(filepath.Join(cgroupPath, cgroupv2MemLimitFile), resourceConfig) +} + +func (c *cgroupV2impl) getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) { + return readCgroupMemoryConfig(cgroupPath, cgroupv2MemLimitFile) +} + +// getSupportedUnifiedControllers returns a set of supported controllers when running on cgroup v2 +func getSupportedUnifiedControllers() sets.Set[string] { + // This is the set of controllers used by the Kubelet + supportedControllers := sets.New("cpu", "cpuset", "memory", "hugetlb", "pids") + // Memoize the set of controllers that are present in the root cgroup + availableRootControllersOnce.Do(func() { + var err error + availableRootControllers, err = readUnifiedControllers(cmutil.CgroupRoot) + if err != nil { + panic(fmt.Errorf("cannot read cgroup controllers at %s", cmutil.CgroupRoot)) + } + }) + // Return the set of controllers that are supported both by the Kubelet and by the kernel + return supportedControllers.Intersection(availableRootControllers) +} + +// readUnifiedControllers reads the controllers available at the specified cgroup +func readUnifiedControllers(path string) (sets.Set[string], error) { + controllersFileContent, err := os.ReadFile(filepath.Join(path, "cgroup.controllers")) + if err != nil { + return nil, err + } + controllers := strings.Fields(string(controllersFileContent)) + return sets.New(controllers...), nil +} + +// buildCgroupUnifiedPath builds a path to the specified name. +func (c *cgroupV2impl) buildCgroupUnifiedPath(name CgroupName) string { + cgroupFsAdaptedName := c.Name(name) + return path.Join(cmutil.CgroupRoot, cgroupFsAdaptedName) +} + +// Convert cgroup v1 cpu.shares value to cgroup v2 cpu.weight +// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2 +func cpuSharesToCPUWeight(cpuShares uint64) uint64 { + return uint64((((cpuShares - 2) * 9999) / 262142) + 1) +} + +// Convert cgroup v2 cpu.weight value to cgroup v1 cpu.shares +// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2 +func cpuWeightToCPUShares(cpuWeight uint64) uint64 { + return uint64((((cpuWeight - 1) * 262142) / 9999) + 2) +}