From 3954021c42a5561678384e39d7ec1a303963582a Mon Sep 17 00:00:00 2001 From: Frame Date: Tue, 16 Jan 2024 21:10:59 +0800 Subject: [PATCH] koordlet: fix recursively disabling bvt (#1848) Signed-off-by: saintube --- .../runtimehooks/hooks/groupidentity/rule.go | 70 +++++++++++-------- pkg/koordlet/util/node.go | 17 ++--- pkg/koordlet/util/node_test.go | 46 +++++++++--- 3 files changed, 88 insertions(+), 45 deletions(-) diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go index d30c67db2..4f2b1c761 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go +++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go @@ -188,6 +188,7 @@ func (b *bvtPlugin) ruleUpdateCb(target *statesinformer.CallbackTarget) error { bvtUpdater, err := resourceexecutor.DefaultCgroupUpdaterFactory.New(sysutil.CPUBVTWarpNsName, kubeQOSCgroupPath, strconv.FormatInt(bvtValue, 10), e) if err != nil { klog.Infof("bvt updater create failed, dir %v, error %v", kubeQOSCgroupPath, err) + continue } if _, err = b.executor.Update(true, bvtUpdater); err != nil { klog.Infof("update kube qos %v cpu bvt failed, dir %v, error %v", kubeQOS, kubeQOSCgroupPath, err) @@ -200,17 +201,19 @@ func (b *bvtPlugin) ruleUpdateCb(target *statesinformer.CallbackTarget) error { } // FIXME(saintube): Currently the kernel feature core scheduling is strictly excluded with the group identity's - // bvt=-1. So we have to check and disable all the BE cgroups' bvt for the GroupIdentity before creating the - // core sched cookies. To keep the consistency of the cgroup tree's configuration, we list and update the cgroups - // by the level order when the group identity is globally disabled. - // This check should be removed after the kernel provides a more stable interface. + // bvt=-1. So we have to check and disable all the BE cgroups' bvt for the GroupIdentity before creating the + // core sched cookies. To keep the consistency of the cgroup tree's configuration, we list and update the cgroups + // by the level order when the group identity is globally disabled. + // This check should be removed after the kernel provides a more stable interface. podCgroupMap := map[string]int64{} // pod-level for _, kubeQOS := range []corev1.PodQOSClass{corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} { bvtValue := r.getKubeQOSDirBvtValue(kubeQOS) - podCgroupDirs, err := koordletutil.GetCgroupPathsByTargetDepth(kubeQOS, sysutil.CPUBVTWarpNsName, koordletutil.PodCgroupPathRelativeDepth) + kubeQOSParentDir := koordletutil.GetPodQoSRelativePath(kubeQOS) + podCgroupDirs, err := koordletutil.GetCgroupPathsByTargetDepth(sysutil.CPUBVTWarpNsName, kubeQOSParentDir, koordletutil.PodCgroupPathRelativeDepth) if err != nil { - return fmt.Errorf("get pod cgroup paths failed, qos %s, err: %w", kubeQOS, err) + klog.Infof("get pod cgroup paths failed, qos %s, err: %w", kubeQOS, err) + continue } for _, cgroupDir := range podCgroupDirs { if _, ok := qosCgroupMap[cgroupDir]; ok { // exclude qos cgroup @@ -228,12 +231,32 @@ func (b *bvtPlugin) ruleUpdateCb(target *statesinformer.CallbackTarget) error { bvtUpdater, err := resourceexecutor.DefaultCgroupUpdaterFactory.New(sysutil.CPUBVTWarpNsName, podCgroupPath, strconv.FormatInt(podBvt, 10), e) if err != nil { klog.Infof("bvt updater create failed, dir %v, error %v", podCgroupPath, err) + continue } if _, err = b.executor.Update(true, bvtUpdater); err != nil { klog.Infof("update pod %s cpu bvt failed, dir %v, error %v", util.GetPodKey(podMeta.Pod), podCgroupPath, err) } delete(podCgroupMap, podCgroupPath) + + // container-level + // NOTE: Although we do not set the container's cpu.bvt_warp_ns directly, it is inheritable from the pod-level, + // we have to handle the container-level only when we want to disable the group identity. + containerCgroupDirs, err := koordletutil.GetCgroupPathsByTargetDepth(sysutil.CPUBVTWarpNsName, podCgroupPath, 1) + if err != nil { + klog.Infof("get container cgroup paths failed, dir %s, error %v", podCgroupPath, err) + continue + } + for _, cgroupDir := range containerCgroupDirs { + bvtUpdater, err = resourceexecutor.DefaultCgroupUpdaterFactory.New(sysutil.CPUBVTWarpNsName, cgroupDir, strconv.FormatInt(podBvt, 10), e) + if err != nil { + klog.Infof("bvt updater create failed, dir %v, error %v", cgroupDir, err) + continue + } + if _, err = b.executor.Update(true, bvtUpdater); err != nil { + klog.Infof("update container cpu bvt failed, dir %v, error %v", cgroupDir, err) + } + } } for _, hostApp := range target.HostApplications { hostCtx := protocol.HooksProtocolBuilder.HostApp(&hostApp) @@ -250,35 +273,26 @@ func (b *bvtPlugin) ruleUpdateCb(target *statesinformer.CallbackTarget) error { bvtUpdater, err := resourceexecutor.DefaultCgroupUpdaterFactory.New(sysutil.CPUBVTWarpNsName, podCgroupDir, strconv.FormatInt(bvtValue, 10), e) if err != nil { klog.Infof("bvt updater create failed, dir %v, error %v", podCgroupDir, err) + continue } if _, err = b.executor.Update(true, bvtUpdater); err != nil { klog.Infof("update remaining pod cpu bvt failed, dir %v, error %v", podCgroupDir, err) } - } - // container-level - // NOTE: Although we do not set the container's cpu.bvt_warp_ns directly, it is inheritable from the pod-level, - // we have to handle the container-level only when we want to disable the group identity. - if !r.getEnable() { - for _, kubeQOS := range []corev1.PodQOSClass{corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} { - bvtValue := r.getKubeQOSDirBvtValue(kubeQOS) - containerCgroupDirs, err := koordletutil.GetCgroupPathsByTargetDepth(kubeQOS, sysutil.CPUBVTWarpNsName, koordletutil.ContainerCgroupPathRelativeDepth) + // container-level + containerCgroupDirs, err := koordletutil.GetCgroupPathsByTargetDepth(sysutil.CPUBVTWarpNsName, podCgroupDir, 1) + if err != nil { + klog.Infof("get container cgroup paths failed, dir %s, error %v", podCgroupDir, err) + continue + } + for _, cgroupDir := range containerCgroupDirs { + bvtUpdater, err = resourceexecutor.DefaultCgroupUpdaterFactory.New(sysutil.CPUBVTWarpNsName, cgroupDir, strconv.FormatInt(bvtValue, 10), e) if err != nil { - return fmt.Errorf("get container cgroup paths failed, qos %s, err: %w", kubeQOS, err) + klog.Infof("bvt updater create failed, dir %v, error %v", cgroupDir, err) + continue } - for _, cgroupDir := range containerCgroupDirs { - if _, ok := podCgroupMap[cgroupDir]; ok { - continue - } - - e := audit.V(3).Reason(name).Message("set bvt to %v", bvtValue) - bvtUpdater, err := resourceexecutor.DefaultCgroupUpdaterFactory.New(sysutil.CPUBVTWarpNsName, cgroupDir, strconv.FormatInt(bvtValue, 10), e) - if err != nil { - klog.Infof("bvt updater create failed, dir %v, error %v", cgroupDir, err) - } - if _, err = b.executor.Update(true, bvtUpdater); err != nil { - klog.Infof("update container cpu bvt failed, dir %v, error %v", cgroupDir, err) - } + if _, err = b.executor.Update(true, bvtUpdater); err != nil { + klog.Infof("update remaining container cpu bvt failed, dir %v, error %v", cgroupDir, err) } } } diff --git a/pkg/koordlet/util/node.go b/pkg/koordlet/util/node.go index 5408c7928..f40e38cc2 100644 --- a/pkg/koordlet/util/node.go +++ b/pkg/koordlet/util/node.go @@ -92,25 +92,25 @@ func GetBECPUSetPathsByMaxDepth(relativeDepth int) ([]string, error) { return paths, err } -func GetCgroupPathsByTargetDepth(qos corev1.PodQOSClass, resourceType system.ResourceType, relativeDepth int) ([]string, error) { - rootCgroupParentDir := GetPodQoSRelativePath(qos) +func GetCgroupPathsByTargetDepth(resourceType system.ResourceType, cgroupParent string, relativeDepth int) ([]string, error) { r, err := system.GetCgroupResource(resourceType) if err != nil { return nil, fmt.Errorf("get resource type failed, err: %w", err) } cgroupResource := r.(*system.CgroupResource) - rootCgroupPath := filepath.Dir(r.Path(rootCgroupParentDir)) + absCgroupParent := filepath.Dir(r.Path(cgroupParent)) rootSubfsPath := system.GetRootCgroupSubfsDir(cgroupResource.Subfs) - _, err = os.Stat(rootCgroupPath) + _, err = os.Stat(absCgroupParent) if err != nil { // make sure the rootCgroupPath is available return nil, err } - klog.V(6).Infof("get rootCgroupPath, qos %s, resource %s, path: %s", qos, resourceType, rootCgroupPath) + klog.V(6).Infof("get rootCgroupPath, resource %s, parent %s, absolute path: %s", + resourceType, cgroupParent, absCgroupParent) - absDepth := strings.Count(rootCgroupPath, string(os.PathSeparator)) + relativeDepth + absDepth := strings.Count(absCgroupParent, string(os.PathSeparator)) + relativeDepth var containerPaths []string - err = filepath.WalkDir(rootCgroupPath, func(path string, info os.DirEntry, err error) error { + err = filepath.WalkDir(absCgroupParent, func(path string, info os.DirEntry, err error) error { if err != nil { return err } @@ -129,7 +129,8 @@ func GetCgroupPathsByTargetDepth(qos corev1.PodQOSClass, resourceType system.Res // GetBECPUSetPathsByTargetDepth only gets the be containers' cpuset groups' paths func GetBECPUSetPathsByTargetDepth(relativeDepth int) ([]string, error) { - return GetCgroupPathsByTargetDepth(corev1.PodQOSBestEffort, system.CPUSetCPUSName, relativeDepth) + beCgroupParentDir := GetPodQoSRelativePath(corev1.PodQOSBestEffort) + return GetCgroupPathsByTargetDepth(system.CPUSetCPUSName, beCgroupParentDir, relativeDepth) } // GetCgroupRootBlkIOAbsoluteDir gets the root blkio directory diff --git a/pkg/koordlet/util/node_test.go b/pkg/koordlet/util/node_test.go index 2bd94b66b..4c1003318 100644 --- a/pkg/koordlet/util/node_test.go +++ b/pkg/koordlet/util/node_test.go @@ -52,8 +52,8 @@ func TestGetCgroupPathsByTargetDepth(t *testing.T) { prepareFn func(helper *system.FileTestUtil) } type args struct { - qos corev1.PodQOSClass resourceType system.ResourceType + parentDir string relativeDepth int } tests := []struct { @@ -66,7 +66,7 @@ func TestGetCgroupPathsByTargetDepth(t *testing.T) { { name: "get cgroup resource failed", args: args{ - qos: corev1.PodQOSGuaranteed, + parentDir: GetPodQoSRelativePath(corev1.PodQOSGuaranteed), resourceType: "unknown_resource_xxx", relativeDepth: 1, }, @@ -75,7 +75,7 @@ func TestGetCgroupPathsByTargetDepth(t *testing.T) { { name: "get root cgroup path failed", args: args{ - qos: corev1.PodQOSGuaranteed, + parentDir: GetPodQoSRelativePath(corev1.PodQOSGuaranteed), resourceType: system.CPUSetCPUSName, relativeDepth: 1, }, @@ -92,7 +92,7 @@ func TestGetCgroupPathsByTargetDepth(t *testing.T) { }, }, args: args{ - qos: corev1.PodQOSGuaranteed, + parentDir: GetPodQoSRelativePath(corev1.PodQOSGuaranteed), resourceType: system.CPUSetCPUSName, relativeDepth: 0, }, @@ -122,7 +122,7 @@ func TestGetCgroupPathsByTargetDepth(t *testing.T) { }, }, args: args{ - qos: corev1.PodQOSBestEffort, + parentDir: GetPodQoSRelativePath(corev1.PodQOSBestEffort), resourceType: system.CPUSetCPUSName, relativeDepth: PodCgroupPathRelativeDepth, }, @@ -153,7 +153,7 @@ func TestGetCgroupPathsByTargetDepth(t *testing.T) { }, }, args: args{ - qos: corev1.PodQOSBestEffort, + parentDir: GetPodQoSRelativePath(corev1.PodQOSBestEffort), resourceType: system.CPUSetCPUSName, relativeDepth: ContainerCgroupPathRelativeDepth, }, @@ -184,7 +184,7 @@ func TestGetCgroupPathsByTargetDepth(t *testing.T) { }, }, args: args{ - qos: corev1.PodQOSBestEffort, + parentDir: GetPodQoSRelativePath(corev1.PodQOSBestEffort), resourceType: system.MemoryLimitName, relativeDepth: ContainerCgroupPathRelativeDepth, }, @@ -215,7 +215,7 @@ func TestGetCgroupPathsByTargetDepth(t *testing.T) { }, }, args: args{ - qos: corev1.PodQOSGuaranteed, + parentDir: GetPodQoSRelativePath(corev1.PodQOSGuaranteed), resourceType: system.CPUSetCPUSName, relativeDepth: PodCgroupPathRelativeDepth, }, @@ -226,6 +226,34 @@ func TestGetCgroupPathsByTargetDepth(t *testing.T) { filepath.Join(GetPodQoSRelativePath(corev1.PodQOSGuaranteed), "/kubepods-test-guaranteed-pod.slice"), }, }, + { + name: "get for custom cgroup parent successfully", + fields: fields{ + prepareFn: func(helper *system.FileTestUtil) { + helper.SetCgroupsV2(false) + cpuset, _ := system.GetCgroupResource(system.CPUSetCPUSName) + cgroupParentDir := "test" + helper.WriteCgroupFileContents(cgroupParentDir, cpuset, "0-63") + containerCgroupDir := filepath.Join(cgroupParentDir, "/test-pod-0.slice") + containerCgroupDir1 := filepath.Join(cgroupParentDir, "/test-pod-1.slice") + containerCgroupDir2 := filepath.Join(cgroupParentDir, "/test-pod-2.slice") + helper.WriteCgroupFileContents(containerCgroupDir, cpuset, "0-63") + helper.WriteCgroupFileContents(containerCgroupDir1, cpuset, "0-63") + helper.WriteCgroupFileContents(containerCgroupDir2, cpuset, "0-31") + }, + }, + args: args{ + parentDir: "test", + resourceType: system.CPUSetCPUSName, + relativeDepth: 1, + }, + wantErr: false, + want: []string{ + "test/test-pod-0.slice", + "test/test-pod-1.slice", + "test/test-pod-2.slice", + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { @@ -235,7 +263,7 @@ func TestGetCgroupPathsByTargetDepth(t *testing.T) { tt.fields.prepareFn(helper) } - got, gotErr := GetCgroupPathsByTargetDepth(tt.args.qos, tt.args.resourceType, tt.args.relativeDepth) + got, gotErr := GetCgroupPathsByTargetDepth(tt.args.resourceType, tt.args.parentDir, tt.args.relativeDepth) assert.Equal(t, tt.wantErr, gotErr != nil, gotErr) assert.Equal(t, tt.want, got) })