From 1d6a742b2bc39bcad29a4444f0634fa89dca86c1 Mon Sep 17 00:00:00 2001
From: saintube
Date: Wed, 10 Jan 2024 15:35:39 +0800
Subject: [PATCH] koordlet: fix core sched conflicts with group identity

koordlet: revise core sched group api

Signed-off-by: saintube
---
 apis/slo/v1alpha1/pod.go                      |  56 +-
 pkg/koordlet/koordlet.go                      |   6 +
 pkg/koordlet/runtimehooks/config.go           |   1 +
 .../hooks/coresched/core_sched.go             | 109 +++-
 .../hooks/coresched/core_sched_test.go        | 296 +++++++++-
 .../runtimehooks/hooks/coresched/helper.go    |  30 +-
 .../hooks/coresched/helper_test.go            | 134 +++--
 .../runtimehooks/hooks/coresched/rule.go      |  30 +-
 .../runtimehooks/hooks/coresched/rule_test.go |  75 ++-
 .../runtimehooks/hooks/groupidentity/bvt.go   |  79 ++-
 .../hooks/groupidentity/bvt_test.go           | 214 +++++--
 .../hooks/groupidentity/interceptor.go        |  10 +-
 .../hooks/groupidentity/interceptor_test.go   |  92 ++-
 .../runtimehooks/hooks/groupidentity/rule.go  |  96 +++-
 .../hooks/groupidentity/rule_test.go          | 531 ++++++++++++++----
 pkg/koordlet/util/node.go                     |  25 +-
 pkg/koordlet/util/node_test.go                | 209 ++++++-
 pkg/koordlet/util/system/core_sched.go        |  26 +
 pkg/koordlet/util/system/system_file.go       |  16 +-
 19 files changed, 1673 insertions(+), 362 deletions(-)

diff --git a/apis/slo/v1alpha1/pod.go b/apis/slo/v1alpha1/pod.go
index 501779127..e6c1a7e28 100644
--- a/apis/slo/v1alpha1/pod.go
+++ b/apis/slo/v1alpha1/pod.go
@@ -20,7 +20,6 @@ import (
 	"encoding/json"
 
 	corev1 "k8s.io/api/core/v1"
-	"k8s.io/utils/pointer"
 
 	apiext "github.com/koordinator-sh/koordinator/apis/extension"
 )
@@ -68,12 +67,8 @@ func GetPodMemoryQoSConfig(pod *corev1.Pod) (*PodMemoryQOSConfig, error) {
 
 const (
 	// LabelCoreSchedGroupID is the label key of the group ID of the Linux Core Scheduling.
-	// Value should be a valid UUID or the none value "0".
-	// When the value is a valid UUID, pods with that group ID and the equal CoreExpelled status on the node will be
-	// assigned to the same core sched cookie.
-	// When the value is the none value "0", pod will be reset to the default core sched cookie `0`.
-	// When the k-v pair is missing but the node-level strategy enables the core sched, the pod will be assigned an
-	// internal group according to the pod's UID.
+	// Value should be a valid UUID or empty. If it is empty, the pod is considered to belong to the core sched
+	// group "". Otherwise, the pod's core sched group ID is set to the label value.
 	//
 	// Core Sched: https://docs.kernel.org/admin-guide/hw-vuln/core-scheduling.html
 	// When the Core Sched is enabled, pods with the different core sched group IDs will not be running at the same SMT
@@ -85,20 +80,47 @@ const (
 	// are the same.
 	LabelCoreSchedGroupID = apiext.DomainPrefix + "core-sched-group-id"
 
-	// CoreSchedGroupIDNone is the none value of the core sched group ID which indicates the core sched is disabled for
+	// LabelCoreSchedPolicy is the label key that indicates the policy of the core scheduling.
+	// It supports the following values:
+	// - "" (default): If the core sched is enabled for the node, the pod's group ID is set according to the value
+	//   of the LabelCoreSchedGroupID.
+	// - "none": The core sched is explicitly disabled for the pod even if the node-level strategy is enabled.
+	// - "exclusive": If the core sched is enabled for the node, the pod's group ID is set according to the pod UID,
+	//   so that the pod does not share a core sched cookie with any other pod.
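+	//
+	// For example, pods that should share one cookie can set the same group ID (an illustrative
+	// sketch only; the label keys assume apiext.DomainPrefix resolves to "koordinator.sh/", and the
+	// UUID is a made-up sample):
+	//
+	//	labels:
+	//	  koordinator.sh/core-sched-group-id: "9c084845-6b67-4f68-b1c6-0b1a65b1e4c5"
+	//
+	// while a pod that must not share a cookie with any other pod sets the policy alone, since
+	// "exclusive" overrides the group ID label with the pod UID:
+	//
+	//	labels:
+	//	  koordinator.sh/core-sched-policy: "exclusive"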
+ LabelCoreSchedPolicy = apiext.DomainPrefix + "core-sched-policy" +) + +type CoreSchedPolicy string + +const ( + // CoreSchedPolicyDefault is the default policy of the core scheduling which indicates the core sched group ID + // is set according to the value of the LabelCoreSchedGroupID. + CoreSchedPolicyDefault CoreSchedPolicy = "" + // CoreSchedPolicyNone is the none policy of the core scheduling which indicates the core sched is disabled for // the pod. The pod will be reset to the system-default cookie `0`. - CoreSchedGroupIDNone = "0" + CoreSchedPolicyNone CoreSchedPolicy = "none" + // CoreSchedPolicyExclusive is the exclusive policy of the core scheduling which indicates the core sched group ID + // is set the same as the pod's UID + CoreSchedPolicyExclusive CoreSchedPolicy = "exclusive" ) -// GetCoreSchedGroupID gets the core sched group ID from the pod labels. -// It returns the core sched group ID and whether the pod explicitly disables the core sched. -func GetCoreSchedGroupID(labels map[string]string) (string, *bool) { +// GetCoreSchedGroupID gets the core sched group ID for the pod according to the labels. +func GetCoreSchedGroupID(labels map[string]string) string { + if labels != nil { + return labels[LabelCoreSchedGroupID] + } + return "" +} + +// GetCoreSchedPolicy gets the core sched policy for the pod according to the labels. +func GetCoreSchedPolicy(labels map[string]string) CoreSchedPolicy { if labels == nil { - return "", nil + return CoreSchedPolicyDefault } - value, ok := labels[LabelCoreSchedGroupID] - if !ok { - return "", nil + if v := labels[LabelCoreSchedPolicy]; v == string(CoreSchedPolicyNone) { + return CoreSchedPolicyNone + } else if v == string(CoreSchedPolicyExclusive) { + return CoreSchedPolicyExclusive } - return value, pointer.Bool(value == CoreSchedGroupIDNone) + return CoreSchedPolicyDefault } diff --git a/pkg/koordlet/koordlet.go b/pkg/koordlet/koordlet.go index b16eea09b..a3bca892f 100644 --- a/pkg/koordlet/koordlet.go +++ b/pkg/koordlet/koordlet.go @@ -37,6 +37,7 @@ import ( "github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor" "github.com/koordinator-sh/koordinator/pkg/koordlet/prediction" "github.com/koordinator-sh/koordinator/pkg/koordlet/qosmanager" + "github.com/koordinator-sh/koordinator/pkg/koordlet/resourceexecutor" "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks" "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" statesinformerimpl "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer/impl" @@ -63,6 +64,7 @@ type daemon struct { qosManager qosmanager.QOSManager runtimeHook runtimehooks.RuntimeHook predictServer prediction.PredictServer + executor resourceexecutor.ResourceUpdateExecutor } func NewDaemon(config *config.Configuration) (Daemon, error) { @@ -115,6 +117,7 @@ func NewDaemon(config *config.Configuration) (Daemon, error) { qosManager: qosManager, runtimeHook: runtimeHook, predictServer: predictServer, + executor: resourceexecutor.NewResourceUpdateExecutor(), } return d, nil @@ -163,6 +166,9 @@ func (d *daemon) Run(stopCh <-chan struct{}) { } }() + // start resource executor before the writers modules + go d.executor.Run(stopCh) + // start qos manager go func() { if err := d.qosManager.Run(stopCh); err != nil { diff --git a/pkg/koordlet/runtimehooks/config.go b/pkg/koordlet/runtimehooks/config.go index cd99502ee..dc8a747b0 100644 --- a/pkg/koordlet/runtimehooks/config.go +++ b/pkg/koordlet/runtimehooks/config.go @@ -69,6 +69,7 @@ const ( CPUNormalization 
featuregate.Feature = "CPUNormalization" // CoreSched manages Linux Core Scheduling cookies for containers who enable the core sched. + // NOTE: CoreSched is an alternative policy of the CPU QoS, and it is exclusive to the Group Identity feature. // // owner: @saintube @zwzhang0107 // alpha: v1.4 diff --git a/pkg/koordlet/runtimehooks/hooks/coresched/core_sched.go b/pkg/koordlet/runtimehooks/hooks/coresched/core_sched.go index e9b864c81..e05dd3ae4 100644 --- a/pkg/koordlet/runtimehooks/hooks/coresched/core_sched.go +++ b/pkg/koordlet/runtimehooks/hooks/coresched/core_sched.go @@ -27,7 +27,6 @@ import ( "k8s.io/utils/pointer" "github.com/koordinator-sh/koordinator/apis/extension" - slov1alpha1 "github.com/koordinator-sh/koordinator/apis/slo/v1alpha1" "github.com/koordinator-sh/koordinator/pkg/koordlet/metrics" "github.com/koordinator-sh/koordinator/pkg/koordlet/resourceexecutor" "github.com/koordinator-sh/koordinator/pkg/koordlet/runtimehooks/hooks" @@ -51,6 +50,8 @@ const ( // ExpellerGroupSuffix is the default suffix of the expeller core sched group. ExpellerGroupSuffix = "-expeller" + // NoneGroupID is the special ID denoting none core sched group. + NoneGroupID = "__0__" ) // SYSTEM QoS is excluded from the cookie mutating. @@ -66,9 +67,9 @@ type Plugin struct { initialized *atomic.Bool // whether the cache has been initialized allPodsSyncOnce sync.Once // sync once for AllPods - sysSupported *bool - supportedMsg string - sysEnabled bool + sysSupported *bool + supportedMsg string + giSysctlSupported *bool cookieCache *gocache.Cache // core-sched-group-id -> cookie id, set; if the group has had cookie cookieCacheRWMutex sync.RWMutex @@ -103,10 +104,12 @@ func (p *Plugin) Register(op hooks.Options) { // TODO: hook NRI events RunPodSandbox, PostStartContainer rule.Register(ruleNameForNodeSLO, description, rule.WithParseFunc(statesinformer.RegisterTypeNodeSLOSpec, p.parseRuleForNodeSLO), - rule.WithUpdateCallback(p.ruleUpdateCb)) + rule.WithUpdateCallback(p.ruleUpdateCb), + rule.WithSystemSupported(p.SystemSupported)) rule.Register(ruleNameForAllPods, description, rule.WithParseFunc(statesinformer.RegisterTypeAllPods, p.parseForAllPods), - rule.WithUpdateCallback(p.ruleUpdateCb)) + rule.WithUpdateCallback(p.ruleUpdateCb), + rule.WithSystemSupported(p.SystemSupported)) reconciler.RegisterCgroupReconciler(reconciler.ContainerLevel, sysutil.VirtualCoreSchedCookie, "set core sched cookie to process groups of container specified", p.SetContainerCookie, reconciler.PodQOSFilter(), podQOSConditions...) 
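The hunks on either side of this point implement a single ordering rule: the kernel cannot run the group identity (bvt) and the core scheduling at the same time, so before the plugin hands out core sched cookies it must first switch the group identity sysctl off, and only then turn the core sched sysctl on. A minimal standalone sketch of that handshake, assuming the two knobs live at their usual procfs paths and using hypothetical helper names (the real code goes through sysutil, caches the probe results in sysSupported/giSysctlSupported, and returns errors to the reconciler instead of printing):

	package main

	import (
		"fmt"
		"os"
	)

	const (
		schedCoreSysctl     = "/proc/sys/kernel/sched_core"
		groupIdentitySysctl = "/proc/sys/kernel/sched_group_identity_enabled"
	)

	// writeSysctl writes "1" or "0" into a procfs knob.
	func writeSysctl(path string, enable bool) error {
		value := "0"
		if enable {
			value = "1"
		}
		return os.WriteFile(path, []byte(value), 0644)
	}

	// enableCoreSched mirrors the ordering of initSystem() below: the group
	// identity must be off before any core sched cookie is created.
	func enableCoreSched() error {
		// Disable the group identity first, if the kernel exposes the knob at all.
		if _, err := os.Stat(groupIdentitySysctl); err == nil {
			if err := writeSysctl(groupIdentitySysctl, false); err != nil {
				return fmt.Errorf("cannot disable group identity, err: %w", err)
			}
		}
		return writeSysctl(schedCoreSysctl, true)
	}

	func main() {
		if err := enableCoreSched(); err != nil {
			fmt.Println("core sched not enabled:", err)
		}
	}

The group identity hook later in this patch performs the mirror image of the same handshake (tryDisableCoreSched before SetSchedGroupIdentity(true)), which is why both plugins gain a tryDisable* helper for the opposite feature.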
@@ -125,26 +128,15 @@ func (p *Plugin) Setup(op hooks.Options) { p.cse = sysutil.NewCoreSchedExtended() } -func (p *Plugin) SystemSupported() (bool, string) { +func (p *Plugin) SystemSupported() bool { if p.sysSupported == nil { - isSupported, msg := sysutil.EnableCoreSchedIfSupported() - p.sysSupported = pointer.Bool(isSupported) + sysSupported, msg := sysutil.IsCoreSchedSupported() + p.sysSupported = pointer.Bool(sysSupported) p.supportedMsg = msg - klog.Infof("update system supported info for plugin %s, supported %v, msg %s", - name, *p.sysSupported, p.supportedMsg) + klog.Infof("update system supported info to %v for plugin %v, supported msg %s", + sysSupported, name, msg) } - return *p.sysSupported, p.supportedMsg -} - -func (p *Plugin) InitCache(podMetas []*statesinformer.PodMeta) bool { - if p.initialized.Load() { - return true - } - - synced := p.LoadAllCookies(podMetas) - - p.initialized.Store(synced) - return synced + return *p.sysSupported } func (p *Plugin) IsCacheInited() bool { @@ -200,10 +192,17 @@ func (p *Plugin) SetContainerCookie(proto protocol.HooksProtocol) error { podUID := containerCtx.Request.PodMeta.UID // only process sandbox container or container has valid ID if len(podUID) <= 0 || len(containerCtx.Request.ContainerMeta.ID) <= 0 { - return fmt.Errorf("invalid container ID for plugin %s, pod UID %s, container ID %s", - name, podUID, containerCtx.Request.ContainerMeta.ID) + klog.V(5).Infof("aborted to manage cookie for container %s/%s, err: empty pod UID %s or container ID %s", + containerCtx.Request.PodMeta.String(), containerCtx.Request.ContainerMeta.Name, + podUID, containerCtx.Request.ContainerMeta.ID) + return nil } + if !p.SystemSupported() { + klog.V(6).Infof("skipped to set cookie for container %s/%s, core sched is unsupported, msg: %s", + containerCtx.Request.PodMeta.String(), containerCtx.Request.ContainerMeta.Name, p.supportedMsg) + return nil + } if !p.rule.IsInited() || !p.IsCacheInited() { klog.V(5).Infof("plugin %s has not been inited, rule inited %v, cache inited %v, aborted to set cookie for container %s/%s", name, p.rule.IsInited(), p.IsCacheInited(), containerCtx.Request.PodMeta.String(), containerCtx.Request.ContainerMeta.Name) @@ -219,6 +218,15 @@ func (p *Plugin) SetContainerCookie(proto protocol.HooksProtocol) error { // 1. disabled -> enabled: Add or Assign. // 2. keep enabled: Check the differences of cookie, group ID and the PIDs, and do Assign. if isEnabled { + // FIXME(saintube): Currently we need to ensure the group identity is disabled via sysctl before enabling + // the core sched cookie in the container reconciler, because the disabling during the rule update might + // fail. This check can be removed once the kernel feature provides a way to disable the group identity. + if err := p.initSystem(true); err != nil { + klog.V(4).Infof("plugin %s failed to initialize system for container %s/%s, err: %s", + name, containerCtx.Request.PodMeta.String(), containerCtx.Request.ContainerMeta.Name, err) + return nil + } + return p.enableContainerCookie(containerCtx, groupID) } // else pod disables @@ -226,8 +234,19 @@ func (p *Plugin) SetContainerCookie(proto protocol.HooksProtocol) error { return p.disableContainerCookie(containerCtx, groupID) } -// LoadAllCookies syncs the current core sched cookies of all pods into the cookie cache. 
-func (p *Plugin) LoadAllCookies(podMetas []*statesinformer.PodMeta) bool { +func (p *Plugin) initCache(podMetas []*statesinformer.PodMeta) bool { + if p.initialized.Load() { + return true + } + + synced := p.loadAllCookies(podMetas) + + p.initialized.Store(synced) + return synced +} + +// loadAllCookies syncs the current core sched cookies of all pods into the cookie cache. +func (p *Plugin) loadAllCookies(podMetas []*statesinformer.PodMeta) bool { hasSynced := false p.cookieCacheRWMutex.Lock() defer p.cookieCacheRWMutex.Unlock() @@ -301,6 +320,38 @@ func (p *Plugin) LoadAllCookies(podMetas []*statesinformer.PodMeta) bool { return hasSynced } +func (p *Plugin) initSystem(isEnabled bool) error { + if !isEnabled { // only switch sysctl if enabled + return nil + } + + // NOTE: Currently the kernel feature core scheduling is strictly excluded with the group identity's + // bvt=-1. So we have to check if the GroupIdentity can be disabled before creating core sched cookies. + err := p.tryDisableGroupIdentity() + if err != nil { + return err + } + + sysEnabled, msg := sysutil.EnableCoreSchedIfSupported() + if !sysEnabled { + return fmt.Errorf("failed to enable core sched, msg: %s", msg) + } + + return nil +} + +// tryDisableGroupIdentity tries disabling the group identity via sysctl to safely enable the core sched. +func (p *Plugin) tryDisableGroupIdentity() error { + if p.giSysctlSupported == nil { + p.giSysctlSupported = pointer.Bool(sysutil.IsGroupIdentitySysctlSupported()) + } + if !*p.giSysctlSupported { // not support either the group identity or the core sched + return nil + } + + return sysutil.SetSchedGroupIdentity(false) +} + // enableContainerCookie adds or assigns a core sched cookie for the container. func (p *Plugin) enableContainerCookie(containerCtx *protocol.ContainerContext, groupID string) error { podMetaName := containerCtx.Request.PodMeta.String() @@ -405,7 +456,7 @@ func (p *Plugin) disableContainerCookie(containerCtx *protocol.ContainerContext, // invalid lastGroupID means container not in group cache (container should be cleared or not ever added) // invalid lastCookieEntry means group not in cookie cache (group should be cleared) // let its cached PIDs expire or removed by siblings' Assign - if (len(lastGroupID) <= 0 || lastGroupID == slov1alpha1.CoreSchedGroupIDNone) && lastCookieEntry == nil { + if (len(lastGroupID) <= 0 || lastGroupID == NoneGroupID) && lastCookieEntry == nil { return nil } @@ -446,7 +497,7 @@ func (p *Plugin) getCookieCacheForContainer(groupID, containerUID string) (strin defer p.cookieCacheRWMutex.RUnlock() lastGroupIDIf, containerHasGroup := p.groupCache.Get(containerUID) - lastGroupID := slov1alpha1.CoreSchedGroupIDNone + lastGroupID := NoneGroupID if containerHasGroup { lastGroupID = lastGroupIDIf.(string) } diff --git a/pkg/koordlet/runtimehooks/hooks/coresched/core_sched_test.go b/pkg/koordlet/runtimehooks/hooks/coresched/core_sched_test.go index a94d7cedd..e544b0c5c 100644 --- a/pkg/koordlet/runtimehooks/hooks/coresched/core_sched_test.go +++ b/pkg/koordlet/runtimehooks/hooks/coresched/core_sched_test.go @@ -65,7 +65,7 @@ func TestPluginSystemSupported(t *testing.T) { name: "plugin unsupported since no sched features file", wants: wants{ systemSupported: false, - supportMsg: "core sched not supported", + supportMsg: "file not exist", }, }, { @@ -78,7 +78,7 @@ func TestPluginSystemSupported(t *testing.T) { }, wants: wants{ systemSupported: false, - supportMsg: "core sched not supported", + supportMsg: "not supported neither by sysctl nor by 
sched_features", }, }, { @@ -91,7 +91,7 @@ func TestPluginSystemSupported(t *testing.T) { }, wants: wants{ systemSupported: true, - supportMsg: "", + supportMsg: "sysctl supported", }, }, { @@ -104,6 +104,7 @@ func TestPluginSystemSupported(t *testing.T) { }, wants: wants{ systemSupported: true, + supportMsg: "sched_features supported", }, }, { @@ -116,6 +117,7 @@ func TestPluginSystemSupported(t *testing.T) { }, wants: wants{ systemSupported: true, + supportMsg: "sysctl supported", }, }, } @@ -132,9 +134,163 @@ func TestPluginSystemSupported(t *testing.T) { Reader: resourceexecutor.NewCgroupReader(), Executor: resourceexecutor.NewTestResourceExecutor(), }) - sysSupported, supportMsg := p.SystemSupported() + sysSupported := p.SystemSupported() assert.Equal(t, tt.wants.systemSupported, sysSupported) - assert.Equal(t, tt.wants.supportMsg, supportMsg) + assert.Equal(t, tt.wants.supportMsg, p.supportedMsg) + }) + } +} + +func TestPlugin_initSystem(t *testing.T) { + type fields struct { + prepareFn func(helper *sysutil.FileTestUtil) + giSupported *bool + } + tests := []struct { + name string + fields fields + arg bool + wantErr bool + wantField *bool + wantExtra func(t *testing.T, helper *sysutil.FileTestUtil) + }{ + { + name: "skip to init if rule disabled", + fields: fields{ + giSupported: pointer.Bool(true), + }, + arg: false, + wantErr: false, + wantField: pointer.Bool(true), + }, + { + name: "system does not support sysctl for group identity", + fields: fields{ + prepareFn: func(helper *sysutil.FileTestUtil) { + helper.WriteFileContents(sysutil.GetProcSysFilePath(sysutil.KernelSchedCore), "1") + }, + giSupported: nil, + }, + arg: true, + wantErr: false, + wantField: pointer.Bool(false), + }, + { + name: "already know system does not support sysctl for group identity", + fields: fields{ + prepareFn: func(helper *sysutil.FileTestUtil) { + helper.WriteFileContents(sysutil.GetProcSysFilePath(sysutil.KernelSchedCore), "1") + }, + giSupported: pointer.Bool(false), + }, + arg: true, + wantErr: false, + wantField: pointer.Bool(false), + }, + { + name: "successfully enable core sched and disable group identity", + fields: fields{ + prepareFn: func(helper *sysutil.FileTestUtil) { + helper.WriteFileContents(sysutil.GetProcSysFilePath(sysutil.KernelSchedCore), "0") + bvtConfigPath := sysutil.GetProcSysFilePath(sysutil.KernelSchedGroupIdentityEnable) + helper.WriteFileContents(bvtConfigPath, "1") + }, + }, + arg: true, + wantErr: false, + wantField: pointer.Bool(true), + wantExtra: func(t *testing.T, helper *sysutil.FileTestUtil) { + bvtConfigPath := sysutil.GetProcSysFilePath(sysutil.KernelSchedGroupIdentityEnable) + got := helper.ReadFileContents(bvtConfigPath) + assert.Equal(t, "0", got) + got = helper.ReadFileContents(sysutil.GetProcSysFilePath(sysutil.KernelSchedCore)) + assert.Equal(t, "1", got) + }, + }, + { + name: "successfully disable group identity", + fields: fields{ + prepareFn: func(helper *sysutil.FileTestUtil) { + helper.WriteFileContents(sysutil.GetProcSysFilePath(sysutil.KernelSchedCore), "1") + bvtConfigPath := sysutil.GetProcSysFilePath(sysutil.KernelSchedGroupIdentityEnable) + helper.WriteFileContents(bvtConfigPath, "1") + }, + }, + arg: true, + wantErr: false, + wantField: pointer.Bool(true), + wantExtra: func(t *testing.T, helper *sysutil.FileTestUtil) { + bvtConfigPath := sysutil.GetProcSysFilePath(sysutil.KernelSchedGroupIdentityEnable) + got := helper.ReadFileContents(bvtConfigPath) + assert.Equal(t, "0", got) + got = 
helper.ReadFileContents(sysutil.GetProcSysFilePath(sysutil.KernelSchedCore)) + assert.Equal(t, "1", got) + }, + }, + { + name: "successfully disable group identity for known sysctl support", + fields: fields{ + prepareFn: func(helper *sysutil.FileTestUtil) { + helper.WriteFileContents(sysutil.GetProcSysFilePath(sysutil.KernelSchedCore), "1") + bvtConfigPath := sysutil.GetProcSysFilePath(sysutil.KernelSchedGroupIdentityEnable) + helper.WriteFileContents(bvtConfigPath, "1") + }, + giSupported: pointer.Bool(true), + }, + arg: true, + wantErr: false, + wantField: pointer.Bool(true), + wantExtra: func(t *testing.T, helper *sysutil.FileTestUtil) { + bvtConfigPath := sysutil.GetProcSysFilePath(sysutil.KernelSchedGroupIdentityEnable) + got := helper.ReadFileContents(bvtConfigPath) + assert.Equal(t, "0", got) + got = helper.ReadFileContents(sysutil.GetProcSysFilePath(sysutil.KernelSchedCore)) + assert.Equal(t, "1", got) + }, + }, + { + name: "failed to disable group identity", + fields: fields{ + prepareFn: func(helper *sysutil.FileTestUtil) { + helper.WriteFileContents(sysutil.GetProcSysFilePath(sysutil.KernelSchedCore), "0") + }, + giSupported: pointer.Bool(true), + }, + arg: true, + wantErr: true, + wantField: pointer.Bool(true), + wantExtra: func(t *testing.T, helper *sysutil.FileTestUtil) { + got := helper.ReadFileContents(sysutil.GetProcSysFilePath(sysutil.KernelSchedCore)) + assert.Equal(t, "0", got) + }, + }, + { + name: "failed to enable core sched", + fields: fields{ + giSupported: pointer.Bool(false), + }, + arg: true, + wantErr: true, + wantField: pointer.Bool(false), + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := sysutil.NewFileTestUtil(t) + defer helper.Cleanup() + if tt.fields.prepareFn != nil { + tt.fields.prepareFn(helper) + } + + p := newPlugin() + p.Setup(hooks.Options{}) + p.giSysctlSupported = tt.fields.giSupported + gotErr := p.initSystem(tt.arg) + assert.Equal(t, tt.wantErr, gotErr != nil, gotErr) + assert.Equal(t, tt.wantField, p.giSysctlSupported) + if tt.wantExtra != nil { + tt.wantExtra(t, helper) + } }) } } @@ -180,7 +336,7 @@ func TestPlugin_SetContainerCookie(t *testing.T) { wantErr: true, }, { - name: "missing container ID", + name: "abort for missing container ID", fields: fields{ plugin: newPlugin(), }, @@ -193,7 +349,7 @@ func TestPlugin_SetContainerCookie(t *testing.T) { CgroupParent: "kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", }, }, - wantErr: true, + wantErr: false, }, { name: "rule has not initialized", @@ -214,10 +370,34 @@ func TestPlugin_SetContainerCookie(t *testing.T) { }, wantErr: false, }, + { + name: "system does not support core sched", + fields: fields{ + plugin: testGetEnabledPlugin(), + preparePluginFn: func(p *Plugin) { + p.sysSupported = pointer.Bool(false) + }, + }, + arg: &protocol.ContainerContext{ + Request: protocol.ContainerRequest{ + PodMeta: protocol.PodMeta{ + Name: "test-pod", + UID: "xxxxxx", + }, + ContainerMeta: protocol.ContainerMeta{ + ID: "containerd://yyyyyy", + }, + CgroupParent: "kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", + }, + }, + wantErr: false, + }, { name: "add cookie for LS container correctly", fields: fields{ prepareFn: func(helper *sysutil.FileTestUtil) { + sysctlFeaturePath := sysutil.GetProcSysFilePath(sysutil.KernelSchedCore) + helper.WriteFileContents(sysctlFeaturePath, "1\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcs, "12344\n12345\n12346\n") 
helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcsV2, "12344\n12345\n12346\n") }, @@ -277,6 +457,8 @@ func TestPlugin_SetContainerCookie(t *testing.T) { name: "failed to add cookie for LS container when core sched add failed", fields: fields{ prepareFn: func(helper *sysutil.FileTestUtil) { + sysctlFeaturePath := sysutil.GetProcSysFilePath(sysutil.KernelSchedCore) + helper.WriteFileContents(sysctlFeaturePath, "1\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcs, "12344\n12345\n12346\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcsV2, "12344\n12345\n12346\n") }, @@ -333,6 +515,8 @@ func TestPlugin_SetContainerCookie(t *testing.T) { name: "failed to add cookie for BE container when PIDs no longer exist", fields: fields{ prepareFn: func(helper *sysutil.FileTestUtil) { + sysctlFeaturePath := sysutil.GetProcSysFilePath(sysutil.KernelSchedCore) + helper.WriteFileContents(sysctlFeaturePath, "1\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcs, "12344\n12345\n12346\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-besteffort.slice/kubepods-besteffort-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcsV2, "12344\n12345\n12346\n") }, @@ -385,6 +569,10 @@ func TestPlugin_SetContainerCookie(t *testing.T) { name: "assign cookie for LS container correctly", fields: fields{ prepareFn: func(helper *sysutil.FileTestUtil) { + sysctlFeaturePath := sysutil.GetProcSysFilePath(sysutil.KernelSchedCore) + helper.WriteFileContents(sysctlFeaturePath, "0\n") + giSysctlPath := sysutil.GetProcSysFilePath(sysutil.KernelSchedGroupIdentityEnable) + helper.WriteFileContents(giSysctlPath, "1\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcs, "12344\n12345\n12346\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcsV2, "12344\n12345\n12346\n") }, @@ -454,6 +642,8 @@ func TestPlugin_SetContainerCookie(t *testing.T) { name: "failed to assign cookie for LS container but fallback to add correctly", fields: fields{ prepareFn: func(helper *sysutil.FileTestUtil) { + sysctlFeaturePath := sysutil.GetProcSysFilePath(sysutil.KernelSchedCore) + helper.WriteFileContents(sysctlFeaturePath, "1\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcs, "12344\n12345\n12346\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcsV2, "12344\n12345\n12346\n") }, @@ -521,6 +711,8 @@ func TestPlugin_SetContainerCookie(t *testing.T) { name: "failed to assign cookie for LS container neither add", fields: fields{ prepareFn: func(helper *sysutil.FileTestUtil) { + sysctlFeaturePath := sysutil.GetProcSysFilePath(sysutil.KernelSchedCore) + helper.WriteFileContents(sysctlFeaturePath, "1\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcs, "12344\n12345\n12346\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcsV2, "12344\n12345\n12346\n") }, @@ -580,6 +772,75 @@ func TestPlugin_SetContainerCookie(t 
*testing.T) { groupToCookie: map[string]uint64{}, }, }, + { + name: "failed to assign cookie for LS container since system init failed", + fields: fields{ + prepareFn: func(helper *sysutil.FileTestUtil) { + giSysctlPath := sysutil.GetProcSysFilePath(sysutil.KernelSchedGroupIdentityEnable) + helper.WriteFileContents(giSysctlPath, "1\n") + helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcs, "12344\n12345\n12346\n") + helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcsV2, "12344\n12345\n12346\n") + }, + plugin: testGetEnabledPlugin(), + preparePluginFn: func(p *Plugin) { + f := p.cse.(*sysutil.FakeCoreSchedExtended) + f.SetNextCookieID(2000000) + p.cookieCache.SetDefault("group-xxx-expeller", newCookieCacheEntry(1000000, 1000, 1001, 1002)) + }, + cse: sysutil.NewFakeCoreSchedExtended(map[uint32]uint64{ + 1: 0, + 10: 0, + 1000: 1000000, + 1001: 1000000, + 1002: 1000000, + 12344: 0, + 12345: 0, + 12346: 0, + }, map[uint32]uint32{ + 1: 1, + 1000: 1000, + 1001: 1001, + 1002: 1001, + 12344: 12344, + 12345: 12344, + 12346: 12346, + }, map[uint32]bool{ + 12346: true, + }), + groupID: "group-xxx-expeller", + }, + arg: &protocol.ContainerContext{ + Request: protocol.ContainerRequest{ + PodMeta: protocol.PodMeta{ + Name: "test-pod", + UID: "xxxxxx", + }, + PodAnnotations: map[string]string{}, + PodLabels: map[string]string{ + extension.LabelPodQoS: string(extension.QoSLS), + slov1alpha1.LabelCoreSchedGroupID: "group-xxx", + }, + ContainerMeta: protocol.ContainerMeta{ + Name: "test-container", + ID: "containerd://yyyyyy", + }, + CgroupParent: "kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", + }, + }, + wantErr: false, + wantFields: wantFields{ + cookieToPIDs: map[uint64][]uint32{ + 1000000: { + 1000, + 1001, + 1002, + }, + }, + groupToCookie: map[string]uint64{ + "group-xxx-expeller": 1000000, + }, + }, + }, { name: "clear cookie for LS container correctly", fields: fields{ @@ -625,7 +886,8 @@ func TestPlugin_SetContainerCookie(t *testing.T) { PodAnnotations: map[string]string{}, PodLabels: map[string]string{ extension.LabelPodQoS: string(extension.QoSLS), - slov1alpha1.LabelCoreSchedGroupID: slov1alpha1.CoreSchedGroupIDNone, + slov1alpha1.LabelCoreSchedGroupID: "group-xxx", + slov1alpha1.LabelCoreSchedPolicy: string(slov1alpha1.CoreSchedPolicyNone), }, ContainerMeta: protocol.ContainerMeta{ Name: "test-container", @@ -698,7 +960,8 @@ func TestPlugin_SetContainerCookie(t *testing.T) { PodAnnotations: map[string]string{}, PodLabels: map[string]string{ extension.LabelPodQoS: string(extension.QoSLSR), - slov1alpha1.LabelCoreSchedGroupID: slov1alpha1.CoreSchedGroupIDNone, + slov1alpha1.LabelCoreSchedGroupID: "group-xxx", + slov1alpha1.LabelCoreSchedPolicy: string(slov1alpha1.CoreSchedPolicyNone), }, ContainerMeta: protocol.ContainerMeta{ Name: "test-container", @@ -766,7 +1029,8 @@ func TestPlugin_SetContainerCookie(t *testing.T) { PodAnnotations: map[string]string{}, PodLabels: map[string]string{ extension.LabelPodQoS: string(extension.QoSBE), - slov1alpha1.LabelCoreSchedGroupID: slov1alpha1.CoreSchedGroupIDNone, + slov1alpha1.LabelCoreSchedGroupID: "group-xxx", + slov1alpha1.LabelCoreSchedPolicy: string(slov1alpha1.CoreSchedPolicyNone), }, ContainerMeta: protocol.ContainerMeta{ Name: "test-container", @@ -826,7 +1090,8 @@ func TestPlugin_SetContainerCookie(t *testing.T) { PodAnnotations: map[string]string{}, PodLabels: map[string]string{ 
extension.LabelPodQoS: string(extension.QoSLS), - slov1alpha1.LabelCoreSchedGroupID: slov1alpha1.CoreSchedGroupIDNone, + slov1alpha1.LabelCoreSchedGroupID: "group-xxx", + slov1alpha1.LabelCoreSchedPolicy: string(slov1alpha1.CoreSchedPolicyNone), }, ContainerMeta: protocol.ContainerMeta{ Name: "test-container", @@ -886,7 +1151,8 @@ func TestPlugin_SetContainerCookie(t *testing.T) { PodAnnotations: map[string]string{}, PodLabels: map[string]string{ extension.LabelPodQoS: string(extension.QoSBE), - slov1alpha1.LabelCoreSchedGroupID: slov1alpha1.CoreSchedGroupIDNone, + slov1alpha1.LabelCoreSchedGroupID: "group-xxx", + slov1alpha1.LabelCoreSchedPolicy: string(slov1alpha1.CoreSchedPolicyNone), }, ContainerMeta: protocol.ContainerMeta{ Name: "test-container", @@ -913,6 +1179,8 @@ func TestPlugin_SetContainerCookie(t *testing.T) { name: "add cookie for LS container migrated between groups", fields: fields{ prepareFn: func(helper *sysutil.FileTestUtil) { + sysctlFeaturePath := sysutil.GetProcSysFilePath(sysutil.KernelSchedCore) + helper.WriteFileContents(sysctlFeaturePath, "1\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcs, "12344\n12345\n12346\n") helper.WriteCgroupFileContents("kubepods.slice/kubepods-podxxxxxx.slice/cri-containerd-yyyyyy.scope", sysutil.CPUProcsV2, "12344\n12345\n12346\n") }, @@ -1018,7 +1286,7 @@ func TestPlugin_SetContainerCookie(t *testing.T) { } } -func TestPlugin_LoadAllCookies(t *testing.T) { +func TestPlugin_loadAllCookies(t *testing.T) { type fields struct { prepareFn func(helper *sysutil.FileTestUtil) plugin *Plugin @@ -1816,7 +2084,7 @@ func TestPlugin_LoadAllCookies(t *testing.T) { tt.fields.preparePluginFn(p) } - got := p.LoadAllCookies(tt.arg) + got := p.loadAllCookies(tt.arg) assert.Equal(t, tt.want, got) for groupID, cookieID := range tt.wantFields.groupToCookie { if cookieID <= 0 { diff --git a/pkg/koordlet/runtimehooks/hooks/coresched/helper.go b/pkg/koordlet/runtimehooks/hooks/coresched/helper.go index 041404555..f98e68ec0 100644 --- a/pkg/koordlet/runtimehooks/hooks/coresched/helper.go +++ b/pkg/koordlet/runtimehooks/hooks/coresched/helper.go @@ -236,38 +236,24 @@ func (p *Plugin) clearCookie(pids []uint32, groupID string, lastCookieID uint64) // getPodEnabledAndGroup gets whether the pod enables the core scheduling and the group ID if it does. 
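+// The resolution order in the body below is: the QoS-level rule decides whether the core sched is
+// enabled for the pod at all, the policy label can then force-disable it ("none") or redirect the
+// group to the pod UID ("exclusive"), and the expeller suffix is appended last so that expeller and
+// non-expeller pods never land in the same cookie group.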
func (p *Plugin) getPodEnabledAndGroup(podAnnotations, podLabels map[string]string, podKubeQOS corev1.PodQOSClass, podUID string) (bool, string) { - // if the pod enables/disables the core-sched explicitly - groupID, isPodDisabled := slov1alpha1.GetCoreSchedGroupID(podLabels) - if isPodDisabled != nil && *isPodDisabled { // pod disables - return false, groupID - } - + groupID := slov1alpha1.GetCoreSchedGroupID(podLabels) + policy := slov1alpha1.GetCoreSchedPolicy(podLabels) podQOS := extension.QoSNone if podLabels != nil { podQOS = extension.GetQoSClassByAttrs(podLabels, podAnnotations) } - isQOSEnabled, isExpeller := p.rule.IsPodEnabled(podQOS, podKubeQOS) - groupID = p.getGroupID(groupID, podUID, isExpeller) - - if isPodDisabled != nil { // assert *isPodDisabled == true - return true, groupID - } + isEnabled, isExpeller := p.rule.IsPodEnabled(podQOS, podKubeQOS) - // use the QoS-level rules - return isQOSEnabled, groupID -} - -func (p *Plugin) getGroupID(baseGroupID string, podUID string, isExpeller bool) string { - var groupID string - if len(baseGroupID) > 0 { - groupID = baseGroupID - } else { + if policy == slov1alpha1.CoreSchedPolicyExclusive { groupID = podUID + } else if policy == slov1alpha1.CoreSchedPolicyNone { + isEnabled = false } if isExpeller { groupID += ExpellerGroupSuffix } - return groupID + + return isEnabled, groupID } func (p *Plugin) getContainerUID(podUID string, containerID string) string { diff --git a/pkg/koordlet/runtimehooks/hooks/coresched/helper_test.go b/pkg/koordlet/runtimehooks/hooks/coresched/helper_test.go index ac825158a..d2c52c141 100644 --- a/pkg/koordlet/runtimehooks/hooks/coresched/helper_test.go +++ b/pkg/koordlet/runtimehooks/hooks/coresched/helper_test.go @@ -722,7 +722,7 @@ func Test_isPodEnabled(t *testing.T) { want1 string }{ { - name: "pod enabled on annotation", + name: "pod enabled", field: field{ rule: testGetEnabledRule(), }, @@ -738,67 +738,140 @@ func Test_isPodEnabled(t *testing.T) { want1: "group-xxx-expeller", }, { - name: "pod enabled on annotation 1", + name: "BE pod enabled", field: field{ - rule: testGetDisabledRule(), + rule: testGetEnabledRule(), }, args: args{ podAnnotations: map[string]string{}, podLabels: map[string]string{ - slov1alpha1.LabelCoreSchedGroupID: "", + extension.LabelPodQoS: string(extension.QoSBE), }, podUID: "xxx", }, want: true, - want1: "xxx", + want1: "", }, { - name: "pod disabled on annotation", + name: "burstable pod enabled", field: field{ rule: testGetEnabledRule(), }, args: args{ podAnnotations: map[string]string{}, - podLabels: map[string]string{ - slov1alpha1.LabelCoreSchedGroupID: slov1alpha1.CoreSchedGroupIDNone, - }, - podUID: "xxx", + podLabels: map[string]string{}, + podKubeQOS: corev1.PodQOSBurstable, + podUID: "xxx", }, - want: false, - want1: slov1alpha1.CoreSchedGroupIDNone, + want: true, + want1: "-expeller", }, { - name: "pod enabled according to nodeSLO", + name: "pod enabled without group id label", field: field{ rule: testGetEnabledRule(), }, args: args{ - podKubeQOS: corev1.PodQOSBurstable, - podUID: "xxx", + podAnnotations: map[string]string{}, + podUID: "xxx", + podKubeQOS: corev1.PodQOSBurstable, }, want: true, - want1: "xxx-expeller", + want1: "-expeller", }, { - name: "pod enabled according to nodeSLO 1", + name: "pod enabled with policy exclusive", field: field{ rule: testGetEnabledRule(), }, args: args{ + podAnnotations: map[string]string{}, podLabels: map[string]string{ - extension.LabelPodQoS: string(extension.QoSLS), + extension.LabelPodQoS: string(extension.QoSLS), + 
slov1alpha1.LabelCoreSchedPolicy: string(slov1alpha1.CoreSchedPolicyExclusive), }, - podAnnotations: map[string]string{}, - podKubeQOS: corev1.PodQOSGuaranteed, - podUID: "xxx", + podUID: "xxx", }, want: true, want1: "xxx-expeller", }, { - name: "pod enabled according to nodeSLO 2", + name: "pod disabled", + field: field{ + rule: testGetDisabledRule(), + }, + args: args{ + podAnnotations: map[string]string{}, + podLabels: map[string]string{ + slov1alpha1.LabelCoreSchedGroupID: "group-xxx", + }, + podUID: "xxx", + }, + want: false, + want1: "group-xxx", + }, + { + name: "BE pod disabled", + field: field{ + rule: testGetDisabledRule(), + }, + args: args{ + podAnnotations: map[string]string{}, + podLabels: map[string]string{ + extension.LabelPodQoS: string(extension.QoSBE), + slov1alpha1.LabelCoreSchedGroupID: "group-xxx", + }, + podUID: "xxx", + }, + want: false, + want1: "group-xxx", + }, + { + name: "burstable pod disabled", + field: field{ + rule: testGetDisabledRule(), + }, + args: args{ + podAnnotations: map[string]string{}, + podLabels: map[string]string{}, + podKubeQOS: corev1.PodQOSBurstable, + podUID: "xxx", + }, + want: false, + want1: "", + }, + { + name: "pod disabled without group id label", + field: field{ + rule: testGetDisabledRule(), + }, + args: args{ + podLabels: map[string]string{}, + podKubeQOS: corev1.PodQOSBestEffort, + }, + want: false, + want1: "", + }, + { + name: "pod disabled by policy none", + field: field{ + rule: testGetEnabledRule(), + }, + args: args{ + podLabels: map[string]string{ + extension.LabelPodQoS: string(extension.QoSLS), + slov1alpha1.LabelCoreSchedPolicy: string(slov1alpha1.CoreSchedPolicyNone), + }, + podKubeQOS: corev1.PodQOSGuaranteed, + }, + want: false, + want1: "-expeller", + }, + { + name: "pod enabled according to nodeSLO", field: field{ rule: &Rule{ + enable: true, podQOSParams: map[extension.QoSClass]Param{ extension.QoSLSE: testGetDisabledRuleParam(), extension.QoSLSR: testGetDisabledRuleParam(), @@ -841,24 +914,13 @@ func Test_isPodEnabled(t *testing.T) { podUID: "xxx", }, want: true, - want1: "xxx", + want1: "", }, { name: "pod disabled according to nodeSLO", - field: field{ - rule: testGetDisabledRule(), - }, - args: args{ - podKubeQOS: corev1.PodQOSBestEffort, - podUID: "xxx", - }, - want: false, - want1: "xxx", - }, - { - name: "pod disabled according to nodeSLO 1", field: field{ rule: &Rule{ + enable: false, podQOSParams: map[extension.QoSClass]Param{ extension.QoSLSE: testGetDisabledRuleParam(), extension.QoSLSR: testGetDisabledRuleParam(), @@ -901,7 +963,7 @@ func Test_isPodEnabled(t *testing.T) { podUID: "xxx", }, want: false, - want1: "xxx", + want1: "", }, } for _, tt := range tests { diff --git a/pkg/koordlet/runtimehooks/hooks/coresched/rule.go b/pkg/koordlet/runtimehooks/hooks/coresched/rule.go index 8296c39cc..036535a21 100644 --- a/pkg/koordlet/runtimehooks/hooks/coresched/rule.go +++ b/pkg/koordlet/runtimehooks/hooks/coresched/rule.go @@ -49,12 +49,14 @@ func newParam(qosCfg *slov1alpha1.CPUQOSCfg, policy slov1alpha1.CPUQOSPolicy) Pa type Rule struct { lock sync.RWMutex + enable bool // node-level switch podQOSParams map[extension.QoSClass]Param kubeQOSPodParams map[corev1.PodQOSClass]Param } func newRule() *Rule { return &Rule{ + enable: false, podQOSParams: make(map[extension.QoSClass]Param), kubeQOSPodParams: make(map[corev1.PodQOSClass]Param), } @@ -66,10 +68,19 @@ func (r *Rule) IsInited() bool { return len(r.podQOSParams) > 0 && len(r.kubeQOSPodParams) > 0 } +func (r *Rule) IsEnabled() bool { + r.lock.RLock() + 
defer r.lock.RUnlock() + return r.enable +} + // IsPodEnabled returns if the pod's core sched is enabled by the rule, and if the QoS-level core expeller is enabled. func (r *Rule) IsPodEnabled(podQoSClass extension.QoSClass, podKubeQOS corev1.PodQOSClass) (bool, bool) { r.lock.RLock() defer r.lock.RUnlock() + if !r.enable { + return false, false + } if val, exist := r.podQOSParams[podQoSClass]; exist { return val.IsPodEnabled, val.IsExpeller } @@ -93,11 +104,13 @@ func (r *Rule) IsKubeQOSCPUIdle(KubeQOS corev1.PodQOSClass) bool { func (r *Rule) Update(ruleNew *Rule) bool { r.lock.Lock() defer r.lock.Unlock() - isEqual := reflect.DeepEqual(r.podQOSParams, ruleNew.podQOSParams) && + isEqual := r.enable == ruleNew.enable && + reflect.DeepEqual(r.podQOSParams, ruleNew.podQOSParams) && reflect.DeepEqual(r.kubeQOSPodParams, ruleNew.kubeQOSPodParams) if isEqual { return false } + r.enable = ruleNew.enable r.podQOSParams = ruleNew.podQOSParams r.kubeQOSPodParams = ruleNew.kubeQOSPodParams return true @@ -127,6 +140,7 @@ func (p *Plugin) parseRuleForNodeSLO(mergedNodeSLOIf interface{}) (bool, error) } ruleNew := &Rule{ + enable: lsrValue.IsPodEnabled || lsValue.IsPodEnabled || beValue.IsPodEnabled, podQOSParams: map[extension.QoSClass]Param{ extension.QoSLSE: lsrValue, extension.QoSLSR: lsrValue, @@ -172,9 +186,9 @@ func (p *Plugin) ruleUpdateCb(target *statesinformer.CallbackTarget) error { return nil } - // TBD: try to enable the kernel feature if needed - if supported, msg := p.SystemSupported(); !supported { - klog.V(4).Infof("plugin %s is not supported by system, msg: %s", name, msg) + // check the kernel feature and enable if needed + if !p.SystemSupported() { + klog.V(4).Infof("plugin %s is not supported by system, msg: %s", name, p.supportedMsg) return nil } @@ -184,7 +198,13 @@ func (p *Plugin) ruleUpdateCb(target *statesinformer.CallbackTarget) error { return nil } - if !p.InitCache(podMetas) { + if err := p.initSystem(p.rule.IsEnabled()); err != nil { + klog.Warningf("plugin %s failed to initialize system, err: %s", name, err) + return nil + } + klog.V(6).Infof("plugin %s initialize system successfully", name) + + if !p.initCache(podMetas) { klog.V(4).Infof("plugin %s aborted for cookie cache has not been initialized", name) return nil } diff --git a/pkg/koordlet/runtimehooks/hooks/coresched/rule_test.go b/pkg/koordlet/runtimehooks/hooks/coresched/rule_test.go index a7a1e9734..60f9eeb9f 100644 --- a/pkg/koordlet/runtimehooks/hooks/coresched/rule_test.go +++ b/pkg/koordlet/runtimehooks/hooks/coresched/rule_test.go @@ -268,6 +268,7 @@ func Test_parseRuleForNodeSLO(t *testing.T) { want: true, wantErr: false, wantField: &Rule{ + enable: true, podQOSParams: map[extension.QoSClass]Param{ extension.QoSLSE: { IsPodEnabled: false, @@ -313,6 +314,7 @@ func Test_parseRuleForNodeSLO(t *testing.T) { name: "policy enabled on BE", field: field{ rule: &Rule{ + enable: true, podQOSParams: map[extension.QoSClass]Param{ extension.QoSLSE: { IsPodEnabled: true, @@ -384,6 +386,7 @@ func Test_parseRuleForNodeSLO(t *testing.T) { want: true, wantErr: false, wantField: &Rule{ + enable: true, podQOSParams: map[extension.QoSClass]Param{ extension.QoSLSE: { IsPodEnabled: true, @@ -559,6 +562,72 @@ func Test_ruleUpdateCb(t *testing.T) { initialized: false, }, }, + { + name: "failed to init core sched via sysctl", + fields: fields{ + prepareFn: func(helper *sysutil.FileTestUtil) { + }, + plugin: newPlugin(), + preparePluginFn: func(p *Plugin) { + p.rule = testGetEnabledRule() + p.sysSupported = pointer.Bool(true) + }, 
+ }, + arg: &statesinformer.CallbackTarget{ + Pods: []*statesinformer.PodMeta{ + { + CgroupDir: "kubepods.slice/kubepods-podxxxxxx.slice", + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + UID: "xxxxxx", + Annotations: map[string]string{}, + Labels: map[string]string{ + extension.LabelPodQoS: string(extension.QoSLS), + slov1alpha1.LabelCoreSchedGroupID: "group-xxx", + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "test-container", + Resources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2"), + corev1.ResourceMemory: resource.MustParse("4Gi"), + }, + }, + }, + }, + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + QOSClass: corev1.PodQOSGuaranteed, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "test-container", + ContainerID: "containerd://yyyyyy", + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{}, + }, + }, + }, + }, + }, + }, + }, + }, + wantErr: false, + wantFields: wantFields{ + rule: testGetEnabledRule(), + sysSupported: pointer.Bool(true), + initialized: false, + }, + }, { name: "no cookie has been synced", fields: fields{ @@ -1513,7 +1582,8 @@ func Test_ruleUpdateCb(t *testing.T) { Annotations: map[string]string{}, Labels: map[string]string{ extension.LabelPodQoS: string(extension.QoSLSR), - slov1alpha1.LabelCoreSchedGroupID: slov1alpha1.CoreSchedGroupIDNone, + slov1alpha1.LabelCoreSchedGroupID: "group-nnn", + slov1alpha1.LabelCoreSchedPolicy: string(slov1alpha1.CoreSchedPolicyNone), }, }, Spec: corev1.PodSpec{ @@ -1635,6 +1705,7 @@ func testGetDisabledRuleParam() Param { func testGetEnabledRule() *Rule { // use default CPUQOS return &Rule{ + enable: true, podQOSParams: map[extension.QoSClass]Param{ extension.QoSLSE: { IsPodEnabled: true, @@ -1680,6 +1751,7 @@ func testGetEnabledRule() *Rule { func testGetAllEnabledRule() *Rule { // use default CPUQOS and enable CPU Idle return &Rule{ + enable: true, podQOSParams: map[extension.QoSClass]Param{ extension.QoSLSE: { IsPodEnabled: true, @@ -1724,6 +1796,7 @@ func testGetAllEnabledRule() *Rule { func testGetDisabledRule() *Rule { return &Rule{ + enable: false, podQOSParams: map[extension.QoSClass]Param{ extension.QoSLSE: testGetDisabledRuleParam(), extension.QoSLSR: testGetDisabledRuleParam(), diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt.go index 4c4f10d4f..37ec1872c 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt.go +++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt.go @@ -40,12 +40,14 @@ const ( ) type bvtPlugin struct { - rule *bvtRule - ruleRWMutex sync.RWMutex - sysSupported *bool - hasKernelEnabled *bool // whether kernel is configurable for enabling bvt (via `kernel.sched_group_identity_enabled`) - kernelEnabled *bool // if not nil, indicates whether bvt feature is enabled via `kernel.sched_group_identity_enabled` - executor resourceexecutor.ResourceUpdateExecutor + rule *bvtRule + ruleRWMutex sync.RWMutex + + sysSupported *bool + hasKernelEnabled *bool // whether kernel is configurable for enabling bvt (via `kernel.sched_group_identity_enabled`) + coreSchedSysctlSupported *bool // whether core sched is supported by the sysctl + + executor resourceexecutor.ResourceUpdateExecutor } func (b *bvtPlugin) Register(op hooks.Options) { @@ -71,8 +73,8 
@@ func (b *bvtPlugin) SystemSupported() bool {
 	if err == nil {
 		isBVTSupported, msg = bvtResource.IsSupported(util.GetPodQoSRelativePath(corev1.PodQOSGuaranteed))
 	}
-	bvtConfigPath := sysutil.GetProcSysFilePath(sysutil.KernelSchedGroupIdentityEnable)
-	b.sysSupported = pointer.Bool(isBVTSupported || sysutil.FileExists(bvtConfigPath))
+	sysSupported := isBVTSupported || sysutil.IsGroupIdentitySysctlSupported()
+	b.sysSupported = pointer.Bool(sysSupported)
 	klog.Infof("update system supported info to %v for plugin %v, supported msg %s",
 		*b.sysSupported, name, msg)
 }
@@ -81,41 +83,66 @@ func (b *bvtPlugin) SystemSupported() bool {
 
 func (b *bvtPlugin) hasKernelEnable() bool {
 	if b.hasKernelEnabled == nil {
-		bvtConfigPath := sysutil.GetProcSysFilePath(sysutil.KernelSchedGroupIdentityEnable)
-		b.hasKernelEnabled = pointer.Bool(sysutil.FileExists(bvtConfigPath))
+		b.hasKernelEnabled = pointer.Bool(sysutil.IsGroupIdentitySysctlSupported())
 	}
 	return *b.hasKernelEnabled
 }
 
-// initKernelEnable checks and initializes the sysctl configuration for the bvt (group identity).
-// It returns any error for the initialization.
-func (b *bvtPlugin) initialize() error {
-	// NOTE: bvt (group identity) is supported and can be initialized in the system if:
-	// 1. anolis os kernel (<26.4): cgroup cpu.bvt_warp_ns exists but sysctl kernel.sched_group_identity_enabled no exist,
-	//    the bvt feature is enabled by default, no need to set sysctl.
-	// 2. anolis os kernel (>=26.4): both cgroup cpu.bvt_warp_ns and sysctl kernel.sched_group_identity_enabled exist,
-	//    the bvt feature is enabled when kernel.sched_group_identity_enabled is set as `1`.
-	if !b.hasKernelEnable() { // skip initialization of kernel does not support bvt sysctl
+// tryDisableCoreSched tries disabling the core scheduling via sysctl to safely enable the group identity.
+func (b *bvtPlugin) tryDisableCoreSched() error {
+	if b.coreSchedSysctlSupported == nil {
+		b.coreSchedSysctlSupported = pointer.Bool(sysutil.IsCoreSchedSysctlSupported())
+	}
+	if !*b.coreSchedSysctlSupported {
 		return nil
 	}
-	// if cpu qos is enabled/disabled in rule, check if we need to change the sysctl config for bvt (group identity)
-	if b.kernelEnabled != nil && *b.kernelEnabled {
-		klog.V(6).Infof("skip initialize plugin %s, no need to change sysctl", name)
+	return sysutil.SetSchedCore(false)
+}
+
+// initSysctl initializes the sysctl for the group identity.
+// It returns an error if the core sched cannot be disabled or the group identity sysctl cannot be enabled.
+func (b *bvtPlugin) initSysctl() error {
+	if !b.hasKernelEnable() { // kernel does not support bvt sysctl
 		return nil
 	}
+	// NOTE: Currently the kernel feature core scheduling is strictly excluded with the group identity's
+	// bvt=-1. So we have to check if the CoreSched can be disabled before enabling group identity.
+	err := b.tryDisableCoreSched()
+	if err != nil {
+		return err
+	}
+
 	// try to set bvt kernel enabled via sysctl when the sysctl config is disabled or unknown
-	err := sysutil.SetSchedGroupIdentity(true)
+	// https://github.com/koordinator-sh/koordinator/pull/1172
+	err = sysutil.SetSchedGroupIdentity(true)
 	if err != nil {
-		return fmt.Errorf("cannot enable kernel sysctl for bvt, err: %v", err)
+		return fmt.Errorf("cannot enable kernel sysctl for bvt, err: %w", err)
 	}
-	b.kernelEnabled = pointer.Bool(true)
-	klog.V(4).Infof("hook plugin %s is successfully initialized", name)
 	return nil
 }
 
+// isSysctlEnabled checks if the sysctl configuration for the bvt (group identity) is enabled.
+// It returns whether cgroups need to set, e.g. if the sysctl config is not disabled. +func (b *bvtPlugin) isSysctlEnabled() (bool, error) { + // NOTE: bvt (group identity) is supported and can be initialized in the system if: + // 1. anolis os kernel (<26.4): cgroup cpu.bvt_warp_ns exists but sysctl kernel.sched_group_identity_enabled no exist, + // the bvt feature is enabled by default, no need to set sysctl. + // 2. anolis os kernel (>=26.4): both cgroup cpu.bvt_warp_ns and sysctl kernel.sched_group_identity_enabled exist, + // the bvt feature is enabled when kernel.sched_group_identity_enabled is set as `1`. + if !b.hasKernelEnable() { // consider enabled when kernel does not support bvt sysctl + return true, nil + } + + isSysEnabled, err := sysutil.GetSchedGroupIdentity() + if err != nil { + return true, fmt.Errorf("cannot get sysctl group identity, err: %v", err) + } + return isSysEnabled, nil +} + var singleton *bvtPlugin func Object() *bvtPlugin { diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt_test.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt_test.go index 88d034c53..1d072a1ba 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt_test.go +++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/bvt_test.go @@ -40,11 +40,6 @@ func initKernelGroupIdentity(value int64, helper *system.FileTestUtil) { helper.WriteProcSubFileContents(filepath.Join(system.SysctlSubDir, system.KernelSchedGroupIdentityEnable), strconv.FormatInt(value, 10)) } -func getKernelGroupIdentity(helper *system.FileTestUtil) (int64, error) { - valueStr := helper.ReadProcSubFileContents(filepath.Join(system.SysctlSubDir, system.KernelSchedGroupIdentityEnable)) - return strconv.ParseInt(valueStr, 10, 64) -} - func getPodCPUBvt(podDirWithKube string, helper *system.FileTestUtil) int64 { valueStr := helper.ReadCgroupFileContents(podDirWithKube, system.CPUBVTWarpNs) value, _ := strconv.ParseInt(valueStr, 10, 64) @@ -110,6 +105,15 @@ func Test_bvtPlugin_systemSupported(t *testing.T) { } } +func TestObject(t *testing.T) { + t.Run("test", func(t *testing.T) { + b := Object() + assert.NotNil(t, b) + b1 := Object() + assert.Equal(t, b, b1) + }) +} + func Test_bvtPlugin_Register(t *testing.T) { t.Run("register bvt plugin", func(t *testing.T) { b := &bvtPlugin{} @@ -117,106 +121,208 @@ func Test_bvtPlugin_Register(t *testing.T) { }) } -func Test_bvtPlugin_initialized(t *testing.T) { - kubeRootDir := util.GetPodQoSRelativePath(corev1.PodQOSGuaranteed) +func Test_bvtPlugin_initSysctl(t *testing.T) { type fields struct { - initPath *string - initKernelGroupIdentity bool - initKernelGroupIdentityValue int - rule *bvtRule - hasKernelEnable *bool - kernelEnabled *bool + prepareFn func(helper *system.FileTestUtil) + hasKernelEnabled *bool + coreSchedSysctlSupported *bool } tests := []struct { - name string - fields fields - wantErr bool - wantFields *int64 + name string + fields fields + wantErr bool + wantFn func(t *testing.T, helper *system.FileTestUtil) }{ { - name: "cannot initialize since system not support", - fields: fields{}, + name: "no need to init when sysctl not exist", + fields: fields{ + hasKernelEnabled: nil, + }, wantErr: false, }, { - name: "no need to initialize", + name: "only enable sysctl for group identity", fields: fields{ - hasKernelEnable: pointer.Bool(false), + prepareFn: func(helper *system.FileTestUtil) { + helper.WriteFileContents(system.GetProcSysFilePath(system.KernelSchedGroupIdentityEnable), "0") + }, + hasKernelEnabled: nil, }, wantErr: false, + wantFn: func(t *testing.T, 
helper *system.FileTestUtil) { + got := helper.ReadFileContents(system.GetProcSysFilePath(system.KernelSchedGroupIdentityEnable)) + assert.Equal(t, "1", got) + }, }, { - name: "failed to initialize", + name: "enable sysctl for group identity and disable core sched", fields: fields{ - hasKernelEnable: pointer.Bool(true), + prepareFn: func(helper *system.FileTestUtil) { + helper.WriteFileContents(system.GetProcSysFilePath(system.KernelSchedGroupIdentityEnable), "0") + helper.WriteFileContents(system.GetProcSysFilePath(system.KernelSchedCore), "1") + }, + hasKernelEnabled: nil, + }, + wantErr: false, + wantFn: func(t *testing.T, helper *system.FileTestUtil) { + got := helper.ReadFileContents(system.GetProcSysFilePath(system.KernelSchedGroupIdentityEnable)) + assert.Equal(t, "1", got) + got = helper.ReadFileContents(system.GetProcSysFilePath(system.KernelSchedCore)) + assert.Equal(t, "0", got) }, - wantErr: true, }, { - name: "initialized since only bvt file exist", + name: "skip enable sysctl for group identity 1", fields: fields{ - initPath: &kubeRootDir, + prepareFn: func(helper *system.FileTestUtil) { + helper.WriteFileContents(system.GetProcSysFilePath(system.KernelSchedGroupIdentityEnable), "1") + helper.WriteFileContents(system.GetProcSysFilePath(system.KernelSchedCore), "0") + }, + hasKernelEnabled: nil, }, wantErr: false, + wantFn: func(t *testing.T, helper *system.FileTestUtil) { + got := helper.ReadFileContents(system.GetProcSysFilePath(system.KernelSchedGroupIdentityEnable)) + assert.Equal(t, "1", got) + got = helper.ReadFileContents(system.GetProcSysFilePath(system.KernelSchedCore)) + assert.Equal(t, "0", got) + }, }, { - name: "initialized since bvt kernel file exist", + name: "failed to disable core sched", fields: fields{ - initKernelGroupIdentity: true, - initKernelGroupIdentityValue: 0, - rule: &bvtRule{ - enable: true, + prepareFn: func(helper *system.FileTestUtil) { + helper.WriteFileContents(system.GetProcSysFilePath(system.KernelSchedGroupIdentityEnable), "0") }, + hasKernelEnabled: pointer.Bool(true), + coreSchedSysctlSupported: pointer.Bool(true), + }, + wantErr: true, + wantFn: func(t *testing.T, helper *system.FileTestUtil) { + got := helper.ReadFileContents(system.GetProcSysFilePath(system.KernelSchedGroupIdentityEnable)) + assert.Equal(t, "0", got) }, - wantFields: pointer.Int64(1), }, { - name: "initialized since bvt kernel file exist 1", + name: "failed to disable sysctl for group identity", fields: fields{ - initKernelGroupIdentity: true, - initKernelGroupIdentityValue: 1, + prepareFn: func(helper *system.FileTestUtil) { + helper.WriteFileContents(system.GetProcSysFilePath(system.KernelSchedCore), "1") + }, + hasKernelEnabled: pointer.Bool(true), + }, + wantErr: true, + wantFn: func(t *testing.T, helper *system.FileTestUtil) { + got := helper.ReadFileContents(system.GetProcSysFilePath(system.KernelSchedCore)) + assert.Equal(t, "0", got) + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := system.NewFileTestUtil(t) + defer helper.Cleanup() + if tt.fields.prepareFn != nil { + tt.fields.prepareFn(helper) + } + + b := &bvtPlugin{ + hasKernelEnabled: tt.fields.hasKernelEnabled, + coreSchedSysctlSupported: tt.fields.coreSchedSysctlSupported, + } + gotErr := b.initSysctl() + assert.Equal(t, tt.wantErr, gotErr != nil, gotErr) + if tt.wantFn != nil { + tt.wantFn(t, helper) + } + }) + } +} + +func Test_bvtPlugin_prepare(t *testing.T) { + kubeRootDir := util.GetPodQoSRelativePath(corev1.PodQOSGuaranteed) + type fields struct { + 
initPath *string + initKernelGroupIdentity bool + initKernelGroupIdentityValue int + rule *bvtRule + hasKernelEnable *bool + sysSupported *bool + } + tests := []struct { + name string + fields fields + want *bvtRule + wantFields *int64 + }{ + { + name: "cannot prepare since system not support", + fields: fields{}, + want: nil, + }, + { + name: "prepare successfully without sysctl", + fields: fields{ + initPath: &kubeRootDir, rule: &bvtRule{ - enable: true, + enable: false, }, + hasKernelEnable: pointer.Bool(false), + }, + want: &bvtRule{ + enable: false, + }, + }, + { + name: "failed to prepare since rule is empty", + fields: fields{ + sysSupported: pointer.Bool(true), + hasKernelEnable: pointer.Bool(true), }, - wantFields: pointer.Int64(1), + want: nil, }, { - name: "initialized since bvt kernel file exist 2", + name: "no need to prepare since rule and sysctl disabled", fields: fields{ - initPath: &kubeRootDir, initKernelGroupIdentity: true, initKernelGroupIdentityValue: 0, rule: &bvtRule{ - enable: true, + enable: false, }, + sysSupported: pointer.Bool(true), + hasKernelEnable: pointer.Bool(true), }, - wantFields: pointer.Int64(1), + want: nil, }, { - name: "not initialize since bvt file not exist and cpu qos disabled", + name: "need to prepare since sysctl enabled", fields: fields{ initKernelGroupIdentity: true, initKernelGroupIdentityValue: 1, rule: &bvtRule{ enable: false, }, + sysSupported: pointer.Bool(true), + hasKernelEnable: pointer.Bool(true), + }, + want: &bvtRule{ + enable: false, }, - wantFields: pointer.Int64(1), }, { - name: "skip sysctl set since bvt kernel enable not changed", + name: "need to prepare since rule enabled", fields: fields{ - initPath: &kubeRootDir, initKernelGroupIdentity: true, - initKernelGroupIdentityValue: 0, + initKernelGroupIdentityValue: 1, rule: &bvtRule{ enable: true, }, + sysSupported: pointer.Bool(true), hasKernelEnable: pointer.Bool(true), - kernelEnabled: pointer.Bool(true), }, - wantFields: pointer.Int64(0), + want: &bvtRule{ + enable: true, + }, }, } for _, tt := range tests { @@ -232,16 +338,10 @@ func Test_bvtPlugin_initialized(t *testing.T) { b := &bvtPlugin{ rule: tt.fields.rule, hasKernelEnabled: tt.fields.hasKernelEnable, - kernelEnabled: tt.fields.kernelEnabled, - } - gotErr := b.initialize() - assert.Equal(t, tt.wantErr, gotErr != nil) - - if tt.wantFields != nil { - got, err := getKernelGroupIdentity(testHelper) - assert.NoError(t, err) - assert.Equal(t, *tt.wantFields, got) + sysSupported: tt.fields.sysSupported, } + got := b.prepare() + assert.Equal(t, tt.want, got) }) } } diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor.go index a589e8852..868695460 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor.go +++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor.go @@ -73,10 +73,16 @@ func (b *bvtPlugin) prepare() *bvtRule { klog.V(5).Infof("hook plugin rule is nil, nothing to do for plugin %v", name) return nil } - err := b.initialize() + + isSysctlEnabled, err := b.isSysctlEnabled() if err != nil { - klog.V(4).Infof("failed to initialize plugin %s, err: %s", name, err) + klog.V(4).Infof("failed to check sysctl for plugin %s, err: %s", name, err) + return nil + } + if !r.getEnable() && !isSysctlEnabled { // no need to update cgroups if both rule and sysctl disabled + klog.V(5).Infof("rule is disabled for plugin %v, no more to do for resources", name) return nil } + return r } diff --git 
a/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor_test.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor_test.go index 44ee6fa52..6a1b6ab6a 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor_test.go +++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/interceptor_test.go @@ -68,6 +68,24 @@ func Test_bvtPlugin_SetPodBvtValue_Proxy(t *testing.T) { corev1.PodQOSBestEffort: 0, }, } + testRule1 := &bvtRule{ + enable: true, + podQOSParams: map[ext.QoSClass]int64{ + ext.QoSLSR: 0, + ext.QoSLS: 0, + ext.QoSBE: -1, + }, + kubeQOSDirParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 0, + corev1.PodQOSBestEffort: -1, + }, + kubeQOSPodParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 0, + corev1.PodQOSBestEffort: -1, + }, + } type fields struct { rule *bvtRule systemSupported *bool @@ -186,7 +204,7 @@ func Test_bvtPlugin_SetPodBvtValue_Proxy(t *testing.T) { }, }, { - name: "set guaranteed bvt none while kernel sysctl changed", + name: "no need to set guaranteed bvt none while kernel sysctl disabled", fields: fields{ rule: noneRule, systemSupported: pointer.Bool(true), @@ -202,6 +220,25 @@ func Test_bvtPlugin_SetPodBvtValue_Proxy(t *testing.T) { }, response: &runtimeapi.PodSandboxHookResponse{}, }, + want: want{}, + }, + { + name: "set guaranteed bvt none while kernel sysctl changed", + fields: fields{ + rule: testRule1, + systemSupported: pointer.Bool(true), + initKernelGroupIdentity: true, + initKernelGroupIdentityValue: 1, + }, + args: args{ + request: &runtimeapi.PodSandboxHookRequest{ + Labels: map[string]string{ + ext.LabelPodQoS: string(ext.QoSLS), + }, + CgroupParent: "kubepods/pod-guaranteed-test-uid/", + }, + response: &runtimeapi.PodSandboxHookResponse{}, + }, want: want{ bvtValue: pointer.Int64(0), }, @@ -320,6 +357,24 @@ func Test_bvtPlugin_SetKubeQOSBvtValue_Reconciler(t *testing.T) { corev1.PodQOSBestEffort: 0, }, } + testRule := &bvtRule{ + enable: true, + podQOSParams: map[ext.QoSClass]int64{ + ext.QoSLSR: 0, + ext.QoSLS: 0, + ext.QoSBE: -1, + }, + kubeQOSDirParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 0, + corev1.PodQOSBestEffort: -1, + }, + kubeQOSPodParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 0, + corev1.PodQOSBestEffort: -1, + }, + } type fields struct { rule *bvtRule sysSupported *bool @@ -420,7 +475,7 @@ func Test_bvtPlugin_SetKubeQOSBvtValue_Reconciler(t *testing.T) { }, }, { - name: "set guaranteed bvt none while kernel sysctl changed", + name: "no need to set guaranteed bvt none while kernel sysctl disabled", fields: fields{ rule: noneRule, sysSupported: pointer.Bool(true), @@ -430,6 +485,19 @@ func Test_bvtPlugin_SetKubeQOSBvtValue_Reconciler(t *testing.T) { args: args{ kubeQOS: corev1.PodQOSGuaranteed, }, + want: want{}, + }, + { + name: "set guaranteed bvt none while kernel sysctl changed", + fields: fields{ + rule: testRule, + sysSupported: pointer.Bool(true), + initKernelGroupIdentity: true, + initKernelGroupIdentityValue: 1, + }, + args: args{ + kubeQOS: corev1.PodQOSGuaranteed, + }, want: want{ bvtValue: pointer.Int64(0), }, @@ -506,9 +574,8 @@ func Test_bvtPlugin_SetHostAppBvtValue(t *testing.T) { }, } type fields struct { - rule *bvtRule - sysSupported *bool - kernelEnabled *bool + rule *bvtRule + sysSupported *bool } type args struct { qos ext.QoSClass @@ -523,9 +590,8 @@ func Test_bvtPlugin_SetHostAppBvtValue(t *testing.T) { { name: "set bvt 
value for ls host application",
 			fields: fields{
-				rule:          defaultRule,
-				sysSupported:  pointer.Bool(true),
-				kernelEnabled: pointer.Bool(true),
+				rule:         defaultRule,
+				sysSupported: pointer.Bool(true),
 			},
 			args: args{
 				qos: ext.QoSLS,
@@ -536,9 +602,8 @@ func Test_bvtPlugin_SetHostAppBvtValue(t *testing.T) {
 		{
 			name: "set bvt value for none host application",
 			fields: fields{
-				rule:          defaultRule,
-				sysSupported:  pointer.Bool(true),
-				kernelEnabled: pointer.Bool(true),
+				rule:         defaultRule,
+				sysSupported: pointer.Bool(true),
 			},
 			args: args{
 				qos: ext.QoSNone,
@@ -550,9 +615,8 @@ func Test_bvtPlugin_SetHostAppBvtValue(t *testing.T) {
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
 			b := &bvtPlugin{
-				rule:          tt.fields.rule,
-				sysSupported:  tt.fields.sysSupported,
-				kernelEnabled: tt.fields.kernelEnabled,
+				rule:         tt.fields.rule,
+				sysSupported: tt.fields.sysSupported,
 			}
 
 			ctx := &protocol.HostAppContext{}
diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go
index 8a1bdae9e..d30c67db2 100644
--- a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go
+++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule.go
@@ -155,6 +155,31 @@ func (b *bvtPlugin) ruleUpdateCb(target *statesinformer.CallbackTarget) error {
 		klog.V(5).Infof("hook plugin rule is nil, nothing to do for plugin %v", name)
 		return nil
 	}
+
+	// check sysctl
+	// Currently, the kernel feature core scheduling conflicts with the group identity. So before we enable the
+	// group identity, we should check that the GroupIdentity can be enabled via sysctl and that the CoreSched can
+	// be disabled via sysctl. And when we disable the group identity, we check whether the GroupIdentity is already
+	// disabled in sysctl, in which case we do not need to update the cgroups.
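+	// To illustrate the intended decision flow (a sketch; the ordering is an assumption drawn from the tests):
+	//   rule enabled                         -> initSysctl(): enable the group identity sysctl, disable core sched
+	//   rule disabled, sysctl still enabled  -> fall through and reset the cgroups' bvt values below
+	//   rule disabled, sysctl disabled       -> return early, no cgroup needs to be updated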
+	isEnabled := r.getEnable()
+	if isEnabled {
+		if err := b.initSysctl(); err != nil {
+			klog.Warningf("failed to initialize system config for plugin %s, err: %s", name, err)
+			return nil
+		}
+	} else {
+		isSysctlEnabled, err := b.isSysctlEnabled()
+		if err != nil {
+			klog.Warningf("failed to check sysctl for plugin %s, err: %s", name, err)
+			return nil
+		}
+		if !isSysctlEnabled { // no need to update cgroups since both the rule and the sysctl are disabled
+			klog.V(4).Infof("rule is disabled for plugin %v, no more to do for resources", name)
+			return nil
+		}
+	}
+
+	qosCgroupMap := map[string]struct{}{}
 	for _, kubeQOS := range []corev1.PodQOSClass{
 		corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} {
 		bvtValue := r.getKubeQOSDirBvtValue(kubeQOS)
@@ -162,15 +187,38 @@ func (b *bvtPlugin) ruleUpdateCb(target *statesinformer.CallbackTarget) error {
 		e := audit.V(3).Group(string(kubeQOS)).Reason(name).Message("set bvt to %v", bvtValue)
 		bvtUpdater, err := resourceexecutor.DefaultCgroupUpdaterFactory.New(sysutil.CPUBVTWarpNsName, kubeQOSCgroupPath, strconv.FormatInt(bvtValue, 10), e)
 		if err != nil {
-			klog.Infof("bvtupdater create failed, dir %v, error %v", kubeQOSCgroupPath, err)
+			klog.Infof("bvt updater create failed, dir %v, error %v", kubeQOSCgroupPath, err)
 		}
-		if _, err := b.executor.Update(true, bvtUpdater); err != nil {
+		if _, err = b.executor.Update(true, bvtUpdater); err != nil {
 			klog.Infof("update kube qos %v cpu bvt failed, dir %v, error %v", kubeQOS, kubeQOSCgroupPath, err)
 		}
+		qosCgroupMap[kubeQOSCgroupPath] = struct{}{}
 	}
+
 	if target == nil {
 		return fmt.Errorf("callback target is nil")
 	}
+
+	// FIXME(saintube): Currently the kernel feature core scheduling is strictly mutually exclusive with the group
+	// identity's bvt=-1. So we have to check and disable all the BE cgroups' bvt for the GroupIdentity before
+	// creating the core sched cookies. To keep the cgroup tree's configuration consistent, we list and update the
+	// cgroups in level order when the group identity is globally disabled.
+	// This check should be removed after the kernel provides a more stable interface.
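+	// For example (the paths are illustrative), the level order is: the QoS level (e.g. kubepods-besteffort.slice),
+	// then the pod level (e.g. kubepods-besteffort-pod<uid>.slice), and finally the container level
+	// (e.g. cri-containerd-<id>.scope), so that no child cgroup keeps bvt=-1 after its parent has been reset.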
+	podCgroupMap := map[string]int64{}
+	// pod-level
+	for _, kubeQOS := range []corev1.PodQOSClass{corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} {
+		bvtValue := r.getKubeQOSDirBvtValue(kubeQOS)
+		podCgroupDirs, err := koordletutil.GetCgroupPathsByTargetDepth(kubeQOS, sysutil.CPUBVTWarpNsName, koordletutil.PodCgroupPathRelativeDepth)
+		if err != nil {
+			return fmt.Errorf("get pod cgroup paths failed, qos %s, err: %w", kubeQOS, err)
+		}
+		for _, cgroupDir := range podCgroupDirs {
+			if _, ok := qosCgroupMap[cgroupDir]; ok { // exclude qos cgroup
+				continue
+			}
+			podCgroupMap[cgroupDir] = bvtValue
+		}
+	}
 	for _, podMeta := range target.Pods {
 		podQOS := ext.GetPodQoSClassRaw(podMeta.Pod)
 		podKubeQOS := podMeta.Pod.Status.QOSClass
@@ -179,12 +227,13 @@ func (b *bvtPlugin) ruleUpdateCb(target *statesinformer.CallbackTarget) error {
 		e := audit.V(3).Pod(podMeta.Pod.Namespace, podMeta.Pod.Name).Reason(name).Message("set bvt to %v", podBvt)
 		bvtUpdater, err := resourceexecutor.DefaultCgroupUpdaterFactory.New(sysutil.CPUBVTWarpNsName, podCgroupPath, strconv.FormatInt(podBvt, 10), e)
 		if err != nil {
-			klog.Infof("bvtupdater create failed, dir %v, error %v", podCgroupPath, err)
+			klog.Infof("bvt updater create failed, dir %v, error %v", podCgroupPath, err)
 		}
-		if _, err := b.executor.Update(true, bvtUpdater); err != nil {
+		if _, err = b.executor.Update(true, bvtUpdater); err != nil {
 			klog.Infof("update pod %s cpu bvt failed, dir %v, error %v",
 				util.GetPodKey(podMeta.Pod), podCgroupPath, err)
 		}
+		delete(podCgroupMap, podCgroupPath)
 	}
 	for _, hostApp := range target.HostApplications {
 		hostCtx := protocol.HooksProtocolBuilder.HostApp(&hostApp)
@@ -195,6 +244,45 @@ func (b *bvtPlugin) ruleUpdateCb(target *statesinformer.CallbackTarget) error {
 			klog.V(5).Infof("set host application %v bvt value finished", hostApp.Name)
 		}
 	}
+	// handle the remaining pod cgroups, which can belong to dangling pods
+	for podCgroupDir, bvtValue := range podCgroupMap {
+		e := audit.V(3).Reason(name).Message("set bvt to %v", bvtValue)
+		bvtUpdater, err := resourceexecutor.DefaultCgroupUpdaterFactory.New(sysutil.CPUBVTWarpNsName, podCgroupDir, strconv.FormatInt(bvtValue, 10), e)
+		if err != nil {
+			klog.Infof("bvt updater create failed, dir %v, error %v", podCgroupDir, err)
+		}
+		if _, err = b.executor.Update(true, bvtUpdater); err != nil {
+			klog.Infof("update remaining pod cpu bvt failed, dir %v, error %v", podCgroupDir, err)
+		}
+	}
+
+	// container-level
+	// NOTE: Although we do not set the container's cpu.bvt_warp_ns directly, it is inherited from the pod level, so
+	// we only need to handle the container level when we want to disable the group identity.
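+	// For instance (a hypothetical tree), a BE container dir like
+	// kubepods-besteffort.slice/kubepods-besteffort-pod<uid>.slice/cri-containerd-<id>.scope may still hold the
+	// inherited bvt=-1 after its pod dir has been reset to 0, so it is walked and reset explicitly below.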
+ if !r.getEnable() { + for _, kubeQOS := range []corev1.PodQOSClass{corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} { + bvtValue := r.getKubeQOSDirBvtValue(kubeQOS) + containerCgroupDirs, err := koordletutil.GetCgroupPathsByTargetDepth(kubeQOS, sysutil.CPUBVTWarpNsName, koordletutil.ContainerCgroupPathRelativeDepth) + if err != nil { + return fmt.Errorf("get container cgroup paths failed, qos %s, err: %w", kubeQOS, err) + } + for _, cgroupDir := range containerCgroupDirs { + if _, ok := podCgroupMap[cgroupDir]; ok { + continue + } + + e := audit.V(3).Reason(name).Message("set bvt to %v", bvtValue) + bvtUpdater, err := resourceexecutor.DefaultCgroupUpdaterFactory.New(sysutil.CPUBVTWarpNsName, cgroupDir, strconv.FormatInt(bvtValue, 10), e) + if err != nil { + klog.Infof("bvt updater create failed, dir %v, error %v", cgroupDir, err) + } + if _, err = b.executor.Update(true, bvtUpdater); err != nil { + klog.Infof("update container cpu bvt failed, dir %v, error %v", cgroupDir, err) + } + } + } + } + return nil } diff --git a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule_test.go b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule_test.go index 94fda677e..030e6b7fd 100644 --- a/pkg/koordlet/runtimehooks/hooks/groupidentity/rule_test.go +++ b/pkg/koordlet/runtimehooks/hooks/groupidentity/rule_test.go @@ -604,8 +604,114 @@ func Test_bvtPlugin_parseRule(t *testing.T) { } func Test_bvtPlugin_ruleUpdateCbForPods(t *testing.T) { + testPodMetaMap := map[string]*statesinformer.PodMeta{ + "lsr-pod": { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "lsr-pod", + Labels: map[string]string{ + ext.LabelPodQoS: string(ext.QoSLSR), + }, + }, + Status: corev1.PodStatus{ + QOSClass: corev1.PodQOSGuaranteed, + }, + }, + CgroupDir: "kubepods.slice/kubepods-test-lsr-pod.slice", + }, + "ls-pod1": { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ls-pod1", + Labels: map[string]string{ + ext.LabelPodQoS: string(ext.QoSLS), + }, + }, + Status: corev1.PodStatus{ + QOSClass: corev1.PodQOSGuaranteed, + }, + }, + CgroupDir: "kubepods.slice/kubepods-test-ls-pod1.slice", + }, + "ls-pod2": { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "ls-pod2", + Labels: map[string]string{ + ext.LabelPodQoS: string(ext.QoSLS), + }, + }, + Status: corev1.PodStatus{ + QOSClass: corev1.PodQOSBurstable, + }, + }, + CgroupDir: "kubepods.slice/kubepods-burstable.slice/kubepods-test-ls-pod2.slice", + }, + "be-pod": { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "be-pod", + Labels: map[string]string{ + ext.LabelPodQoS: string(ext.QoSBE), + }, + }, + Status: corev1.PodStatus{ + QOSClass: corev1.PodQOSBestEffort, + }, + }, + CgroupDir: "kubepods.slice/kubepods-besteffort.slice/kubepods-test-be-pod.slice", + }, + "guaranteed-pod": { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "guaranteed-pod", + }, + Status: corev1.PodStatus{ + QOSClass: corev1.PodQOSGuaranteed, + }, + }, + CgroupDir: "kubepods.slice/kubepods-test-besteffort-pod.slice", + }, + "burstable-pod": { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "burstable-pod", + }, + Status: corev1.PodStatus{ + QOSClass: corev1.PodQOSBurstable, + }, + }, + CgroupDir: "kubepods.slice/kubepods-burstable.slice/kubepods-test-burstable-pod.slice", + }, + "besteffort-pod": { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "besteffort-pod", + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + { + Name: "besteffort-container", + }, + }, + }, + Status: 
corev1.PodStatus{ + QOSClass: corev1.PodQOSBestEffort, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "besteffort-container", + ContainerID: "containerd://xxxxxx", + }, + }, + }, + }, + CgroupDir: "kubepods.slice/kubepods-besteffort.slice/kubepods-test-besteffort-pod.slice", + }, + } type fields struct { - rule *bvtRule + rule *bvtRule + hasKernelEnabled *bool + prepareFn func(helper *system.FileTestUtil) } type args struct { pods map[string]*statesinformer.PodMeta @@ -617,11 +723,13 @@ func Test_bvtPlugin_ruleUpdateCbForPods(t *testing.T) { wantKubeDirVal map[corev1.PodQOSClass]int64 wantPodVal map[string]int64 wantErr bool + wantFn func(t *testing.T, helper *system.FileTestUtil) }{ { name: "callback with ls and be enabled", fields: fields{ rule: &bvtRule{ + enable: true, podQOSParams: map[ext.QoSClass]int64{ ext.QoSLSR: 0, ext.QoSLS: 2, @@ -638,153 +746,331 @@ func Test_bvtPlugin_ruleUpdateCbForPods(t *testing.T) { corev1.PodQOSBestEffort: -1, }, }, + prepareFn: func(helper *system.FileTestUtil) { + for _, kubeQoS := range []corev1.PodQOSClass{corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} { + initCPUBvt(util.GetPodQoSRelativePath(kubeQoS), 0, helper) + } + initCPUBvt(testPodMetaMap["lsr-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["ls-pod1"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["ls-pod2"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["be-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["guaranteed-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["burstable-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["besteffort-pod"].CgroupDir, 0, helper) + }, + }, + args: args{ + pods: testPodMetaMap, + }, + wantKubeDirVal: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: int64(0), + corev1.PodQOSBurstable: int64(2), + corev1.PodQOSBestEffort: int64(-1), + }, + wantPodVal: map[string]int64{ + "lsr-pod": 0, + "ls-pod1": 2, + "ls-pod2": 2, + "be-pod": -1, + "guaranteed-pod": 2, + "burstable-pod": 2, + "besteffort-pod": -1, + }, + wantErr: false, + }, + { + name: "callback with ls and be disabled with dangling be pod", + fields: fields{ + rule: &bvtRule{ + enable: false, + podQOSParams: map[ext.QoSClass]int64{ + ext.QoSLSR: 0, + ext.QoSLS: 0, + ext.QoSBE: 0, + }, + kubeQOSDirParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 0, + corev1.PodQOSBestEffort: 0, + }, + kubeQOSPodParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 0, + corev1.PodQOSBestEffort: 0, + }, + }, + prepareFn: func(helper *system.FileTestUtil) { + cpuSet, _ := system.GetCgroupResource(system.CPUSetCPUSName) + helper.WriteCgroupFileContents(util.GetPodQoSRelativePath(corev1.PodQOSBestEffort), cpuSet, "0-63") + helper.WriteCgroupFileContents(testPodMetaMap["besteffort-pod"].CgroupDir, cpuSet, "0-63") + initCPUBvt(util.GetPodQoSRelativePath(corev1.PodQOSGuaranteed), 0, helper) + initCPUBvt(util.GetPodQoSRelativePath(corev1.PodQOSBurstable), 2, helper) + initCPUBvt(util.GetPodQoSRelativePath(corev1.PodQOSBestEffort), -1, helper) + initCPUBvt(testPodMetaMap["lsr-pod"].CgroupDir, 2, helper) + initCPUBvt(testPodMetaMap["ls-pod1"].CgroupDir, 2, helper) + initCPUBvt(testPodMetaMap["ls-pod2"].CgroupDir, 2, helper) + initCPUBvt(testPodMetaMap["be-pod"].CgroupDir, -1, helper) + initCPUBvt(testPodMetaMap["guaranteed-pod"].CgroupDir, 2, helper) + initCPUBvt(testPodMetaMap["burstable-pod"].CgroupDir, 2, helper) + initCPUBvt(testPodMetaMap["besteffort-pod"].CgroupDir, 
-1, helper) // dangling + }, }, args: args{ pods: map[string]*statesinformer.PodMeta{ - "lsr-pod": { - Pod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "lsr-pod", - Labels: map[string]string{ - ext.LabelPodQoS: string(ext.QoSLSR), - }, - }, - Status: corev1.PodStatus{ - QOSClass: corev1.PodQOSGuaranteed, - }, - }, - CgroupDir: "/kubepods-test-lsr-pod.slice", + "lsr-pod": testPodMetaMap["lsr-pod"], + "ls-pod1": testPodMetaMap["ls-pod1"], + "ls-pod2": testPodMetaMap["ls-pod2"], + "be-pod": testPodMetaMap["be-pod"], + "guaranteed-pod": testPodMetaMap["guaranteed-pod"], + "burstable-pod": testPodMetaMap["burstable-pod"], + // "besteffort-pod" is dangling + }, + }, + wantKubeDirVal: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: int64(0), + corev1.PodQOSBurstable: int64(0), + corev1.PodQOSBestEffort: int64(0), + }, + wantPodVal: map[string]int64{ + "lsr-pod": 0, + "ls-pod1": 0, + "ls-pod2": 0, + "be-pod": 0, + "guaranteed-pod": 0, + "burstable-pod": 0, + }, + wantErr: false, + wantFn: func(t *testing.T, helper *system.FileTestUtil) { + // check dangling pod + got := helper.ReadCgroupFileContents(testPodMetaMap["besteffort-pod"].CgroupDir, system.CPUBVTWarpNs) + assert.Equal(t, "0", got) + }, + }, + { + name: "callback with ls and be disabled with be containers", + fields: fields{ + rule: &bvtRule{ + enable: false, + podQOSParams: map[ext.QoSClass]int64{ + ext.QoSLSR: 0, + ext.QoSLS: 0, + ext.QoSBE: 0, }, - "ls-pod1": { - Pod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "ls-pod1", - Labels: map[string]string{ - ext.LabelPodQoS: string(ext.QoSLS), - }, - }, - Status: corev1.PodStatus{ - QOSClass: corev1.PodQOSGuaranteed, - }, - }, - CgroupDir: "/kubepods-test-ls-pod1.slice", + kubeQOSDirParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 0, + corev1.PodQOSBestEffort: 0, }, - "ls-pod2": { - Pod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "ls-pod2", - Labels: map[string]string{ - ext.LabelPodQoS: string(ext.QoSLS), - }, - }, - Status: corev1.PodStatus{ - QOSClass: corev1.PodQOSBurstable, - }, - }, - CgroupDir: "/kubepods-burstable.slice/kubepods-test-ls-pod2.slice", + kubeQOSPodParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 0, + corev1.PodQOSBestEffort: 0, }, - "be-pod": { - Pod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "be-pod", - Labels: map[string]string{ - ext.LabelPodQoS: string(ext.QoSBE), - }, - }, - Status: corev1.PodStatus{ - QOSClass: corev1.PodQOSBestEffort, - }, - }, - CgroupDir: "/kubepods-besteffort.slice/kubepods-test-be-pod.slice", + }, + prepareFn: func(helper *system.FileTestUtil) { + cpuSet, _ := system.GetCgroupResource(system.CPUSetCPUSName) + helper.WriteCgroupFileContents(util.GetPodQoSRelativePath(corev1.PodQOSBestEffort), cpuSet, "0-63") + helper.WriteCgroupFileContents(testPodMetaMap["besteffort-pod"].CgroupDir, cpuSet, "0-63") + besteffortContainerDir := testPodMetaMap["besteffort-pod"].CgroupDir + "/cri-containerd-xxxxxx.scope" + helper.WriteCgroupFileContents(besteffortContainerDir, cpuSet, "0-63") + initCPUBvt(util.GetPodQoSRelativePath(corev1.PodQOSGuaranteed), 0, helper) + initCPUBvt(util.GetPodQoSRelativePath(corev1.PodQOSBurstable), 2, helper) + initCPUBvt(util.GetPodQoSRelativePath(corev1.PodQOSBestEffort), -1, helper) + initCPUBvt(testPodMetaMap["lsr-pod"].CgroupDir, 2, helper) + initCPUBvt(testPodMetaMap["ls-pod1"].CgroupDir, 2, helper) + initCPUBvt(testPodMetaMap["ls-pod2"].CgroupDir, 2, helper) + 
initCPUBvt(testPodMetaMap["be-pod"].CgroupDir, -1, helper) + initCPUBvt(testPodMetaMap["guaranteed-pod"].CgroupDir, 2, helper) + initCPUBvt(testPodMetaMap["burstable-pod"].CgroupDir, 2, helper) + initCPUBvt(testPodMetaMap["besteffort-pod"].CgroupDir, -1, helper) + initCPUBvt(besteffortContainerDir, -1, helper) + }, + }, + args: args{ + pods: map[string]*statesinformer.PodMeta{ + "lsr-pod": testPodMetaMap["lsr-pod"], + "ls-pod1": testPodMetaMap["ls-pod1"], + "ls-pod2": testPodMetaMap["ls-pod2"], + "be-pod": testPodMetaMap["be-pod"], + "guaranteed-pod": testPodMetaMap["guaranteed-pod"], + "burstable-pod": testPodMetaMap["burstable-pod"], + "besteffort-pod": testPodMetaMap["besteffort-pod"], + }, + }, + wantKubeDirVal: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: int64(0), + corev1.PodQOSBurstable: int64(0), + corev1.PodQOSBestEffort: int64(0), + }, + wantPodVal: map[string]int64{ + "lsr-pod": 0, + "ls-pod1": 0, + "ls-pod2": 0, + "be-pod": 0, + "guaranteed-pod": 0, + "burstable-pod": 0, + "besteffort-pod": 0, + }, + wantErr: false, + wantFn: func(t *testing.T, helper *system.FileTestUtil) { + // check be container + besteffortContainerDir := testPodMetaMap["besteffort-pod"].CgroupDir + "/cri-containerd-xxxxxx.scope" + got := helper.ReadCgroupFileContents(besteffortContainerDir, system.CPUBVTWarpNs) + assert.Equal(t, "0", got) + }, + }, + { + name: "callback with ls and be enabled but init sysctl failed", + fields: fields{ + rule: &bvtRule{ + enable: true, + podQOSParams: map[ext.QoSClass]int64{ + ext.QoSLSR: 0, + ext.QoSLS: 2, + ext.QoSBE: -1, }, - "guaranteed-pod": { - Pod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "guaranteed-pod", - }, - Status: corev1.PodStatus{ - QOSClass: corev1.PodQOSGuaranteed, - }, - }, - CgroupDir: "/kubepods-test-besteffort-pod.slice", + kubeQOSDirParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 2, + corev1.PodQOSBestEffort: -1, }, - "burstable-pod": { - Pod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "burstable-pod", - }, - Status: corev1.PodStatus{ - QOSClass: corev1.PodQOSBurstable, - }, - }, - CgroupDir: "/kubepods-burstable.slice/kubepods-test-burstable-pod.slice", + kubeQOSPodParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 2, + corev1.PodQOSBurstable: 2, + corev1.PodQOSBestEffort: -1, }, - "besteffort-pod": { - Pod: &corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "besteffort-pod", - }, - Status: corev1.PodStatus{ - QOSClass: corev1.PodQOSBestEffort, - }, - }, - CgroupDir: "/kubepods-besteffort.slice/kubepods-test-besteffort-pod.slice", + }, + hasKernelEnabled: pointer.Bool(true), + prepareFn: func(helper *system.FileTestUtil) { + for _, kubeQoS := range []corev1.PodQOSClass{corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} { + initCPUBvt(util.GetPodQoSRelativePath(kubeQoS), 0, helper) + } + initCPUBvt(testPodMetaMap["lsr-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["ls-pod1"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["ls-pod2"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["be-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["guaranteed-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["burstable-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["besteffort-pod"].CgroupDir, 0, helper) + }, + }, + args: args{ + pods: testPodMetaMap, + }, + wantKubeDirVal: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: int64(0), + corev1.PodQOSBurstable: int64(0), + corev1.PodQOSBestEffort: int64(0), + 
}, + wantPodVal: map[string]int64{ + "lsr-pod": 0, + "ls-pod1": 0, + "ls-pod2": 0, + "be-pod": 0, + "guaranteed-pod": 0, + "burstable-pod": 0, + "besteffort-pod": 0, + }, + wantErr: false, + }, + { + name: "callback with ls and be disabled and sysctl disabled", + fields: fields{ + rule: &bvtRule{ + enable: false, + podQOSParams: map[ext.QoSClass]int64{ + ext.QoSLSR: 0, + ext.QoSLS: 0, + ext.QoSBE: 0, + }, + kubeQOSDirParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 0, + corev1.PodQOSBestEffort: 0, }, + kubeQOSPodParams: map[corev1.PodQOSClass]int64{ + corev1.PodQOSGuaranteed: 0, + corev1.PodQOSBurstable: 0, + corev1.PodQOSBestEffort: 0, + }, + }, + hasKernelEnabled: pointer.Bool(true), + prepareFn: func(helper *system.FileTestUtil) { + initKernelGroupIdentity(0, helper) + for _, kubeQoS := range []corev1.PodQOSClass{corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} { + initCPUBvt(util.GetPodQoSRelativePath(kubeQoS), 0, helper) + } + initCPUBvt(testPodMetaMap["lsr-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["ls-pod1"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["ls-pod2"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["be-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["guaranteed-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["burstable-pod"].CgroupDir, 0, helper) + initCPUBvt(testPodMetaMap["besteffort-pod"].CgroupDir, 0, helper) }, }, + args: args{ + pods: testPodMetaMap, + }, wantKubeDirVal: map[corev1.PodQOSClass]int64{ corev1.PodQOSGuaranteed: int64(0), - corev1.PodQOSBurstable: int64(2), - corev1.PodQOSBestEffort: int64(-1), + corev1.PodQOSBurstable: int64(0), + corev1.PodQOSBestEffort: int64(0), }, wantPodVal: map[string]int64{ "lsr-pod": 0, - "ls-pod1": 2, - "ls-pod2": 2, - "be-pod": -1, - "guaranteed-pod": 2, - "burstable-pod": 2, - "besteffort-pod": -1, + "ls-pod1": 0, + "ls-pod2": 0, + "be-pod": 0, + "guaranteed-pod": 0, + "burstable-pod": 0, + "besteffort-pod": 0, }, wantErr: false, }, } for _, tt := range tests { - testHelper := system.NewFileTestUtil(t) - for _, kubeQoS := range []corev1.PodQOSClass{corev1.PodQOSGuaranteed, corev1.PodQOSBurstable, corev1.PodQOSBestEffort} { - initCPUBvt(util.GetPodQoSRelativePath(kubeQoS), 0, testHelper) - } - podList := make([]*statesinformer.PodMeta, 0, len(tt.args.pods)) - for _, pod := range tt.args.pods { - initCPUBvt(pod.CgroupDir, 0, testHelper) - podList = append(podList, pod) - } - target := &statesinformer.CallbackTarget{ - Pods: podList, - } t.Run(tt.name, func(t *testing.T) { + testHelper := system.NewFileTestUtil(t) + defer testHelper.Cleanup() + if tt.fields.prepareFn != nil { + tt.fields.prepareFn(testHelper) + } + + podList := make([]*statesinformer.PodMeta, 0, len(tt.args.pods)) + for _, pod := range tt.args.pods { + podList = append(podList, pod) + } + target := &statesinformer.CallbackTarget{ + Pods: podList, + } b := &bvtPlugin{ - rule: tt.fields.rule, - executor: resourceexecutor.NewResourceUpdateExecutor(), + rule: tt.fields.rule, + hasKernelEnabled: tt.fields.hasKernelEnabled, + executor: resourceexecutor.NewResourceUpdateExecutor(), } stop := make(chan struct{}) - defer func() { close(stop) }() + defer close(stop) b.executor.Run(stop) - if err := b.ruleUpdateCb(target); (err != nil) != tt.wantErr { - t.Errorf("ruleUpdateCb() error = %v, wantErr %v", err, tt.wantErr) - } + err := b.ruleUpdateCb(target) + assert.Equal(t, tt.wantErr, err != nil, err) for kubeQoS, wantBvt := range tt.wantKubeDirVal { gotBvtStr := 
testHelper.ReadCgroupFileContents(util.GetPodQoSRelativePath(kubeQoS), system.CPUBVTWarpNs) - gotBvt, _ := strconv.ParseInt(gotBvtStr, 10, 64) + gotBvt, err := strconv.ParseInt(gotBvtStr, 10, 64) + assert.NoError(t, err) assert.Equal(t, wantBvt, gotBvt, "qos %s bvt value not equal", kubeQoS) } for podName, pod := range tt.args.pods { gotBvtStr := testHelper.ReadCgroupFileContents(pod.CgroupDir, system.CPUBVTWarpNs) - gotBvt, _ := strconv.ParseInt(gotBvtStr, 10, 64) + gotBvt, err := strconv.ParseInt(gotBvtStr, 10, 64) + assert.NoError(t, err) wantBvt := tt.wantPodVal[podName] assert.Equal(t, wantBvt, gotBvt, "pod %s bvt value not equal", podName) } + if tt.wantFn != nil { + tt.wantFn(t, testHelper) + } }) } } @@ -807,6 +1093,7 @@ func Test_bvtPlugin_ruleUpdateCbForHostApp(t *testing.T) { name: "set host application bvt", fields: fields{ rule: &bvtRule{ + enable: true, podQOSParams: map[ext.QoSClass]int64{ ext.QoSLSR: 0, ext.QoSLS: 2, @@ -841,6 +1128,7 @@ func Test_bvtPlugin_ruleUpdateCbForHostApp(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { testHelper := system.NewFileTestUtil(t) + defer testHelper.Cleanup() testApp := tt.args.hostApp if testApp.CgroupPath == nil || (testApp.CgroupPath.Base != "" && testApp.CgroupPath.Base != slov1alpha1.CgroupBaseTypeRoot) { @@ -849,25 +1137,28 @@ func Test_bvtPlugin_ruleUpdateCbForHostApp(t *testing.T) { cgroupDir := filepath.Join(testApp.CgroupPath.ParentDir, testApp.CgroupPath.RelativePath) initCPUBvt(cgroupDir, 0, testHelper) + initCPUBvt(util.GetPodQoSRelativePath(corev1.PodQOSGuaranteed), 0, testHelper) + initCPUBvt(util.GetPodQoSRelativePath(corev1.PodQOSBurstable), 0, testHelper) + initCPUBvt(util.GetPodQoSRelativePath(corev1.PodQOSBestEffort), 0, testHelper) + b := &bvtPlugin{ - rule: tt.fields.rule, - sysSupported: pointer.Bool(true), - kernelEnabled: pointer.Bool(true), - executor: resourceexecutor.NewResourceUpdateExecutor(), + rule: tt.fields.rule, + sysSupported: pointer.Bool(true), + executor: resourceexecutor.NewResourceUpdateExecutor(), } stop := make(chan struct{}) - defer func() { close(stop) }() + defer close(stop) b.executor.Run(stop) target := &statesinformer.CallbackTarget{ HostApplications: []slov1alpha1.HostApplicationSpec{tt.args.hostApp}, } - if err := b.ruleUpdateCb(target); (err != nil) != tt.wantErr { - t.Errorf("ruleUpdateCb() error = %v, wantErr %v", err, tt.wantErr) - } + err := b.ruleUpdateCb(target) + assert.Equal(t, tt.wantErr, err != nil, err) gotBvtStr := testHelper.ReadCgroupFileContents(cgroupDir, system.CPUBVTWarpNs) - gotBvt, _ := strconv.ParseInt(gotBvtStr, 10, 64) + gotBvt, err := strconv.ParseInt(gotBvtStr, 10, 64) + assert.NoError(t, err) assert.Equal(t, tt.wantBvt, gotBvt) }) } diff --git a/pkg/koordlet/util/node.go b/pkg/koordlet/util/node.go index 2cf78fc91..5408c7928 100644 --- a/pkg/koordlet/util/node.go +++ b/pkg/koordlet/util/node.go @@ -17,6 +17,7 @@ limitations under the License. 
 package util
 
 import (
+	"fmt"
 	"os"
 	"path/filepath"
 	"strings"
@@ -91,16 +92,21 @@ func GetBECPUSetPathsByMaxDepth(relativeDepth int) ([]string, error) {
 	return paths, err
 }
 
-// GetBECPUSetPathsByTargetDepth only gets the be containers' cpuset groups' paths
-func GetBECPUSetPathsByTargetDepth(relativeDepth int) ([]string, error) {
-	rootCgroupPath := GetRootCgroupCPUSetDir(corev1.PodQOSBestEffort)
-	rootCPUSetSubfsPath := system.GetRootCgroupSubfsDir(system.CgroupCPUSetDir)
-	_, err := os.Stat(rootCgroupPath)
+// GetCgroupPathsByTargetDepth gets the cgroup directories of the given resource type at the target depth relative
+// to the QoS-level cgroup directory.
+func GetCgroupPathsByTargetDepth(qos corev1.PodQOSClass, resourceType system.ResourceType, relativeDepth int) ([]string, error) {
+	rootCgroupParentDir := GetPodQoSRelativePath(qos)
+	r, err := system.GetCgroupResource(resourceType)
+	if err != nil {
+		return nil, fmt.Errorf("get resource type failed, err: %w", err)
+	}
+	cgroupResource := r.(*system.CgroupResource)
+	rootCgroupPath := filepath.Dir(r.Path(rootCgroupParentDir))
+	rootSubfsPath := system.GetRootCgroupSubfsDir(cgroupResource.Subfs)
+	_, err = os.Stat(rootCgroupPath)
 	if err != nil {
 		// make sure the rootCgroupPath is available
 		return nil, err
 	}
-	klog.V(6).Infof("get be rootCgroupPath: %v", rootCgroupPath)
+	klog.V(6).Infof("get rootCgroupPath, qos %s, resource %s, path: %s", qos, resourceType, rootCgroupPath)
 
 	absDepth := strings.Count(rootCgroupPath, string(os.PathSeparator)) + relativeDepth
 	var containerPaths []string
@@ -110,7 +116,7 @@ func GetBECPUSetPathsByTargetDepth(relativeDepth int) ([]string, error) {
 		}
 		if info.IsDir() && strings.Count(path, string(os.PathSeparator)) == absDepth {
 			// get the path of parentDir
-			parentDir, err1 := filepath.Rel(rootCPUSetSubfsPath, path)
+			parentDir, err1 := filepath.Rel(rootSubfsPath, path)
 			if err1 != nil {
 				return err1
 			}
@@ -121,6 +127,11 @@ func GetBECPUSetPathsByTargetDepth(relativeDepth int) ([]string, error) {
 	return containerPaths, err
 }
 
+// GetBECPUSetPathsByTargetDepth only gets the be containers' cpuset groups' paths
+func GetBECPUSetPathsByTargetDepth(relativeDepth int) ([]string, error) {
+	return GetCgroupPathsByTargetDepth(corev1.PodQOSBestEffort, system.CPUSetCPUSName, relativeDepth)
+}
+
 // GetCgroupRootBlkIOAbsoluteDir gets the root blkio directory
 // @output /sys/fs/cgroup/blkio
 func GetCgroupRootBlkIOAbsoluteDir() string {
diff --git a/pkg/koordlet/util/node_test.go b/pkg/koordlet/util/node_test.go
index 3e8176b40..2bd94b66b 100644
--- a/pkg/koordlet/util/node_test.go
+++ b/pkg/koordlet/util/node_test.go
@@ -17,7 +17,7 @@ limitations under the License. 
package util import ( - "path" + "path/filepath" "testing" "github.com/stretchr/testify/assert" @@ -28,21 +28,216 @@ import ( func Test_GetKubeQosRelativePath(t *testing.T) { guaranteedPathSystemd := GetPodQoSRelativePath(corev1.PodQOSGuaranteed) - assert.Equal(t, path.Clean(system.KubeRootNameSystemd), guaranteedPathSystemd) + assert.Equal(t, filepath.Clean(system.KubeRootNameSystemd), guaranteedPathSystemd) burstablePathSystemd := GetPodQoSRelativePath(corev1.PodQOSBurstable) - assert.Equal(t, path.Join(system.KubeRootNameSystemd, system.KubeBurstableNameSystemd), burstablePathSystemd) + assert.Equal(t, filepath.Join(system.KubeRootNameSystemd, system.KubeBurstableNameSystemd), burstablePathSystemd) besteffortPathSystemd := GetPodQoSRelativePath(corev1.PodQOSBestEffort) - assert.Equal(t, path.Join(system.KubeRootNameSystemd, system.KubeBesteffortNameSystemd), besteffortPathSystemd) + assert.Equal(t, filepath.Join(system.KubeRootNameSystemd, system.KubeBesteffortNameSystemd), besteffortPathSystemd) system.SetupCgroupPathFormatter(system.Cgroupfs) guaranteedPathCgroupfs := GetPodQoSRelativePath(corev1.PodQOSGuaranteed) - assert.Equal(t, path.Clean(system.KubeRootNameCgroupfs), guaranteedPathCgroupfs) + assert.Equal(t, filepath.Clean(system.KubeRootNameCgroupfs), guaranteedPathCgroupfs) burstablePathCgroupfs := GetPodQoSRelativePath(corev1.PodQOSBurstable) - assert.Equal(t, path.Join(system.KubeRootNameCgroupfs, system.KubeBurstableNameCgroupfs), burstablePathCgroupfs) + assert.Equal(t, filepath.Join(system.KubeRootNameCgroupfs, system.KubeBurstableNameCgroupfs), burstablePathCgroupfs) besteffortPathCgroupfs := GetPodQoSRelativePath(corev1.PodQOSBestEffort) - assert.Equal(t, path.Join(system.KubeRootNameCgroupfs, system.KubeBesteffortNameCgroupfs), besteffortPathCgroupfs) + assert.Equal(t, filepath.Join(system.KubeRootNameCgroupfs, system.KubeBesteffortNameCgroupfs), besteffortPathCgroupfs) +} + +func TestGetCgroupPathsByTargetDepth(t *testing.T) { + type fields struct { + prepareFn func(helper *system.FileTestUtil) + } + type args struct { + qos corev1.PodQOSClass + resourceType system.ResourceType + relativeDepth int + } + tests := []struct { + name string + fields fields + args args + want []string + wantErr bool + }{ + { + name: "get cgroup resource failed", + args: args{ + qos: corev1.PodQOSGuaranteed, + resourceType: "unknown_resource_xxx", + relativeDepth: 1, + }, + wantErr: true, + }, + { + name: "get root cgroup path failed", + args: args{ + qos: corev1.PodQOSGuaranteed, + resourceType: system.CPUSetCPUSName, + relativeDepth: 1, + }, + wantErr: true, + }, + { + name: "get qos-level cgroup path successfully", + fields: fields{ + prepareFn: func(helper *system.FileTestUtil) { + helper.SetCgroupsV2(false) + cpuset, _ := system.GetCgroupResource(system.CPUSetCPUSName) + qosCgroupDir := GetPodQoSRelativePath(corev1.PodQOSGuaranteed) + helper.WriteCgroupFileContents(qosCgroupDir, cpuset, "0-63") + }, + }, + args: args{ + qos: corev1.PodQOSGuaranteed, + resourceType: system.CPUSetCPUSName, + relativeDepth: 0, + }, + wantErr: false, + want: []string{ + GetPodQoSRelativePath(corev1.PodQOSGuaranteed), + }, + }, + { + name: "get pod-level cgroup path successfully", + fields: fields{ + prepareFn: func(helper *system.FileTestUtil) { + helper.SetCgroupsV2(false) + cpuset, _ := system.GetCgroupResource(system.CPUSetCPUSName) + qosCgroupDir := GetPodQoSRelativePath(corev1.PodQOSGuaranteed) + qosCgroupDir1 := GetPodQoSRelativePath(corev1.PodQOSBurstable) + qosCgroupDir2 := 
GetPodQoSRelativePath(corev1.PodQOSBestEffort) + helper.WriteCgroupFileContents(qosCgroupDir, cpuset, "0-63") + helper.WriteCgroupFileContents(qosCgroupDir1, cpuset, "0-63") + helper.WriteCgroupFileContents(qosCgroupDir2, cpuset, "0-63") + podCgroupDir := filepath.Join(qosCgroupDir, "/kubepods-test-guaranteed-pod.slice") + podCgroupDir1 := filepath.Join(qosCgroupDir2, "/kubepods-test-besteffort-pod-0.slice") + podCgroupDir2 := filepath.Join(qosCgroupDir2, "/kubepods-test-besteffort-pod-1.slice") + helper.WriteCgroupFileContents(podCgroupDir, cpuset, "0-63") + helper.WriteCgroupFileContents(podCgroupDir1, cpuset, "0-63") + helper.WriteCgroupFileContents(podCgroupDir2, cpuset, "0-31") + }, + }, + args: args{ + qos: corev1.PodQOSBestEffort, + resourceType: system.CPUSetCPUSName, + relativeDepth: PodCgroupPathRelativeDepth, + }, + wantErr: false, + want: []string{ + filepath.Join(GetPodQoSRelativePath(corev1.PodQOSBestEffort), "/kubepods-test-besteffort-pod-0.slice"), + filepath.Join(GetPodQoSRelativePath(corev1.PodQOSBestEffort), "/kubepods-test-besteffort-pod-1.slice"), + }, + }, + { + name: "get container-level cgroup path successfully", + fields: fields{ + prepareFn: func(helper *system.FileTestUtil) { + helper.SetCgroupsV2(false) + cpuset, _ := system.GetCgroupResource(system.CPUSetCPUSName) + qosCgroupDir := GetPodQoSRelativePath(corev1.PodQOSGuaranteed) + qosCgroupDir1 := GetPodQoSRelativePath(corev1.PodQOSBurstable) + qosCgroupDir2 := GetPodQoSRelativePath(corev1.PodQOSBestEffort) + helper.WriteCgroupFileContents(qosCgroupDir, cpuset, "0-63") + helper.WriteCgroupFileContents(qosCgroupDir1, cpuset, "0-63") + helper.WriteCgroupFileContents(qosCgroupDir2, cpuset, "0-63") + containerCgroupDir := filepath.Join(qosCgroupDir, "/kubepods-test-guaranteed-pod.slice/cri-containerd-xxx.scope") + containerCgroupDir1 := filepath.Join(qosCgroupDir2, "/kubepods-test-besteffort-pod-0.slice/cri-containerd-yyy.scope") + containerCgroupDir2 := filepath.Join(qosCgroupDir2, "/kubepods-test-besteffort-pod-1.slice/cri-containerd-zzz.scope") + helper.WriteCgroupFileContents(containerCgroupDir, cpuset, "0-63") + helper.WriteCgroupFileContents(containerCgroupDir1, cpuset, "0-63") + helper.WriteCgroupFileContents(containerCgroupDir2, cpuset, "0-31") + }, + }, + args: args{ + qos: corev1.PodQOSBestEffort, + resourceType: system.CPUSetCPUSName, + relativeDepth: ContainerCgroupPathRelativeDepth, + }, + wantErr: false, + want: []string{ + filepath.Join(GetPodQoSRelativePath(corev1.PodQOSBestEffort), "/kubepods-test-besteffort-pod-0.slice/cri-containerd-yyy.scope"), + filepath.Join(GetPodQoSRelativePath(corev1.PodQOSBestEffort), "/kubepods-test-besteffort-pod-1.slice/cri-containerd-zzz.scope"), + }, + }, + { + name: "get container-level cgroup path successfully on cgroup-v2", + fields: fields{ + prepareFn: func(helper *system.FileTestUtil) { + helper.SetCgroupsV2(true) + memoryLimit, _ := system.GetCgroupResource(system.MemoryLimitName) + qosCgroupDir := GetPodQoSRelativePath(corev1.PodQOSGuaranteed) + qosCgroupDir1 := GetPodQoSRelativePath(corev1.PodQOSBurstable) + qosCgroupDir2 := GetPodQoSRelativePath(corev1.PodQOSBestEffort) + helper.WriteCgroupFileContents(qosCgroupDir, memoryLimit, "107374182400") + helper.WriteCgroupFileContents(qosCgroupDir1, memoryLimit, "107374182400") + helper.WriteCgroupFileContents(qosCgroupDir2, memoryLimit, "107374182400") + containerCgroupDir := filepath.Join(qosCgroupDir, "/kubepods-test-guaranteed-pod.slice/cri-containerd-xxx.scope") + containerCgroupDir1 := 
filepath.Join(qosCgroupDir2, "/kubepods-test-besteffort-pod-0.slice/cri-containerd-yyy.scope") + containerCgroupDir2 := filepath.Join(qosCgroupDir2, "/kubepods-test-besteffort-pod-1.slice/cri-containerd-zzz.scope") + helper.WriteCgroupFileContents(containerCgroupDir, memoryLimit, "1073741824") + helper.WriteCgroupFileContents(containerCgroupDir1, memoryLimit, "1073741824") + helper.WriteCgroupFileContents(containerCgroupDir2, memoryLimit, "1073741824") + }, + }, + args: args{ + qos: corev1.PodQOSBestEffort, + resourceType: system.MemoryLimitName, + relativeDepth: ContainerCgroupPathRelativeDepth, + }, + wantErr: false, + want: []string{ + filepath.Join(GetPodQoSRelativePath(corev1.PodQOSBestEffort), "/kubepods-test-besteffort-pod-0.slice/cri-containerd-yyy.scope"), + filepath.Join(GetPodQoSRelativePath(corev1.PodQOSBestEffort), "/kubepods-test-besteffort-pod-1.slice/cri-containerd-zzz.scope"), + }, + }, + { + name: "get pod-level cgroup path for guaranteed successfully", + fields: fields{ + prepareFn: func(helper *system.FileTestUtil) { + helper.SetCgroupsV2(false) + cpuset, _ := system.GetCgroupResource(system.CPUSetCPUSName) + qosCgroupDir := GetPodQoSRelativePath(corev1.PodQOSGuaranteed) + qosCgroupDir1 := GetPodQoSRelativePath(corev1.PodQOSBurstable) + qosCgroupDir2 := GetPodQoSRelativePath(corev1.PodQOSBestEffort) + helper.WriteCgroupFileContents(qosCgroupDir, cpuset, "0-63") + helper.WriteCgroupFileContents(qosCgroupDir1, cpuset, "0-63") + helper.WriteCgroupFileContents(qosCgroupDir2, cpuset, "0-63") + containerCgroupDir := filepath.Join(qosCgroupDir, "/kubepods-test-guaranteed-pod.slice") + containerCgroupDir1 := filepath.Join(qosCgroupDir2, "/kubepods-test-besteffort-pod-0.slice") + containerCgroupDir2 := filepath.Join(qosCgroupDir2, "/kubepods-test-besteffort-pod-1.slice") + helper.WriteCgroupFileContents(containerCgroupDir, cpuset, "0-63") + helper.WriteCgroupFileContents(containerCgroupDir1, cpuset, "0-63") + helper.WriteCgroupFileContents(containerCgroupDir2, cpuset, "0-31") + }, + }, + args: args{ + qos: corev1.PodQOSGuaranteed, + resourceType: system.CPUSetCPUSName, + relativeDepth: PodCgroupPathRelativeDepth, + }, + wantErr: false, + want: []string{ + GetPodQoSRelativePath(corev1.PodQOSBestEffort), + GetPodQoSRelativePath(corev1.PodQOSBurstable), + filepath.Join(GetPodQoSRelativePath(corev1.PodQOSGuaranteed), "/kubepods-test-guaranteed-pod.slice"), + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := system.NewFileTestUtil(t) + defer helper.Cleanup() + if tt.fields.prepareFn != nil { + tt.fields.prepareFn(helper) + } + + got, gotErr := GetCgroupPathsByTargetDepth(tt.args.qos, tt.args.resourceType, tt.args.relativeDepth) + assert.Equal(t, tt.wantErr, gotErr != nil, gotErr) + assert.Equal(t, tt.want, got) + }) + } } diff --git a/pkg/koordlet/util/system/core_sched.go b/pkg/koordlet/util/system/core_sched.go index 900b8e5d5..ca6a7b390 100644 --- a/pkg/koordlet/util/system/core_sched.go +++ b/pkg/koordlet/util/system/core_sched.go @@ -268,6 +268,32 @@ func (f *FakeCoreSchedExtended) Assign(pidTypeFrom CoreSchedScopeType, pidFrom u return nil, nil } +func IsCoreSchedSysctlSupported() bool { + return FileExists(GetProcSysFilePath(KernelSchedCore)) +} + +// IsCoreSchedSupported checks if the kernel supports the core scheduling. +// Currently, it relies on the interfaces provided by the Anolis OS. 
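+// A brief usage sketch (the return values shown are hypothetical):
+//
+//	if supported, msg := IsCoreSchedSupported(); !supported {
+//		klog.V(4).Infof("core sched is not supported, msg: %s", msg)
+//	}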
+func IsCoreSchedSupported() (bool, string) {
+	// the kernel supports the core sched if:
+	// a) sysctl has sched_core, or
+	// b) sched_features has SCHED_CORE/NO_SCHED_CORE
+	if IsCoreSchedSysctlSupported() {
+		return true, "sysctl supported"
+	}
+
+	isSchedFeaturesSupported, msg := SchedFeatures.IsSupported("")
+	if !isSchedFeaturesSupported { // sched_features unavailable
+		return false, msg
+	}
+	_, err := IsCoreSchedFeatureEnabled()
+	if err == nil {
+		return true, "sched_features supported"
+	}
+
+	return false, "not supported by either sysctl or sched_features"
+}
+
 // EnableCoreSchedIfSupported checks if the core scheduling feature is enabled in the kernel sched_features.
 // If kernel supported (available in the latest Anolis OS), it tries to enable the core scheduling feature.
 // The core sched's kernel feature is known set in two places, if both of them are not found, the system is considered
diff --git a/pkg/koordlet/util/system/system_file.go b/pkg/koordlet/util/system/system_file.go
index be513aa2d..c28ccb0d1 100644
--- a/pkg/koordlet/util/system/system_file.go
+++ b/pkg/koordlet/util/system/system_file.go
@@ -147,6 +147,20 @@ func (*ProcSysctl) SetSysctl(sysctl string, newVal int) error {
 	return os.WriteFile(GetProcSysFilePath(sysctl), []byte(strconv.Itoa(newVal)), 0640)
 }
 
+func IsGroupIdentitySysctlSupported() bool {
+	return FileExists(GetProcSysFilePath(KernelSchedGroupIdentityEnable))
+}
+
+func GetSchedGroupIdentity() (bool, error) {
+	s := NewProcSysctl()
+	// 0: disabled; 1: enabled
+	cur, err := s.GetSysctl(KernelSchedGroupIdentityEnable)
+	if err != nil {
+		return false, fmt.Errorf("cannot get sysctl group identity, err: %w", err)
+	}
+	return cur == 1, nil
+}
+
 func SetSchedGroupIdentity(enable bool) error {
 	s := NewProcSysctl()
 	cur, err := s.GetSysctl(KernelSchedGroupIdentityEnable)
@@ -164,7 +178,7 @@ func SetSchedGroupIdentity(enable bool) error {
 	err = s.SetSysctl(KernelSchedGroupIdentityEnable, v)
 	if err != nil {
-		return fmt.Errorf("cannot set sysctl group identity, err: %v", err)
+		return fmt.Errorf("cannot set sysctl group identity, err: %w", err)
 	}
 	klog.V(4).Infof("SetSchedGroupIdentity set sysctl config successfully, value %v", v)
 	return nil