From 7a8e9fecd9718b1e612c79dfef1775fa30d91a36 Mon Sep 17 00:00:00 2001
From: Krisztian Litkey
Date: Wed, 13 Mar 2024 00:03:53 +0200
Subject: [PATCH] WIP: topology-aware: support for CPU allocator priorities.

Add support for configurable default and annotated per-container
CPU priority preferences. These determine the preferred priority
for CPUs when doing fully or partially exclusive CPU allocation.
Priorities are calculated for such allocations and passed on to
the CPU allocator, which then tries to fulfill these preferences.

It should now be possible to configure the policy to allocate
(exclusive) E-cores by default and P-cores to containers annotated
accordingly, or the other way around.

Signed-off-by: Krisztian Litkey
---
 .../topology-aware/policy/pod-preferences.go  |  66 +++++++---
 cmd/plugins/topology-aware/policy/pools.go    |  47 +++++++
 .../topology-aware/policy/resources.go        | 117 ++++++++++++++----
 .../policy/topology-aware-policy.go           |   7 +-
 .../config.nri_topologyawarepolicies.yaml     |  13 ++
 .../config.nri_topologyawarepolicies.yaml     |  13 ++
 .../resmgr/policy/topologyaware/config.go     |  32 +++++
 7 files changed, 258 insertions(+), 37 deletions(-)

diff --git a/cmd/plugins/topology-aware/policy/pod-preferences.go b/cmd/plugins/topology-aware/policy/pod-preferences.go
index af1bb1b90..ce35215c9 100644
--- a/cmd/plugins/topology-aware/policy/pod-preferences.go
+++ b/cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -42,6 +42,8 @@ const (
 	keyColdStartPreference = "cold-start"
 	// annotation key for reserved pools
 	keyReservedCPUsPreference = "prefer-reserved-cpus"
+	// annotation key for CPU priority preference
+	keyCpuPriorityPreference = "prefer-cpu-priority"
 
 	// effective annotation key for isolated CPU preference
 	preferIsolatedCPUsKey = keyIsolationPreference + "." + kubernetes.ResmgrKeyNamespace
@@ -53,6 +55,8 @@ const (
 	preferColdStartKey = keyColdStartPreference + "." + kubernetes.ResmgrKeyNamespace
 	// annotation key for reserved pools
 	preferReservedCPUsKey = keyReservedCPUsPreference + "." + kubernetes.ResmgrKeyNamespace
+	// effective annotation key for CPU priority preference
+	preferCpuPriorityKey = keyCpuPriorityPreference + "." + kubernetes.ResmgrKeyNamespace
 )
 
 // cpuClass is a type of CPU to allocate
@@ -153,6 +157,36 @@ func sharedCPUsPreference(pod cache.Pod, container cache.Container) (bool, bool)
 	return preference, true
 }
 
+// cpuPrioPreference returns the CPU priority preference for the given container
+// and whether the container was explicitly annotated with this setting.
+func cpuPrioPreference(pod cache.Pod, container cache.Container, fallback cpuPrio) (cpuPrio, bool) {
+	key := preferCpuPriorityKey
+	value, ok := pod.GetEffectiveAnnotation(key, container.GetName())
+
+	if !ok {
+		prio := fallback
+		log.Debug("%s: implicit CPU priority preference %q", container.PrettyName(), prio)
+		return prio, false
+	}
+
+	if value == "default" {
+		prio := defaultPrio
+		log.Debug("%s: explicit CPU priority preference %q", container.PrettyName(), prio)
+		return prio, true
+	}
+
+	prio, ok := cpuPrioByName[value]
+	if !ok {
+		log.Error("%s: invalid CPU priority preference %q", container.PrettyName(), value)
+		prio := fallback
+		log.Debug("%s: implicit CPU priority preference %q", container.PrettyName(), prio)
+		return prio, false
+	}
+
+	log.Debug("%s: explicit CPU priority preference %q", container.PrettyName(), prio)
+	return prio, true
+}
+
 // memoryTypePreference returns what type of memory should be allocated for the container.
 //
 // If the effective annotations are not found, this function falls back to
@@ -370,7 +404,8 @@ func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) {
 // 2. fraction: amount of fractional CPU in milli-CPU
 // 3. isolate: (bool) whether to prefer isolated full CPUs
 // 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
+// 5. cpuPrio: (cpuPrio) preferred CPU priority for exclusive allocations
-func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass) {
+func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass, cpuPrio) {
 	//
 	// CPU allocation preferences for a container consist of
 	//
@@ -439,20 +474,21 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in
 	request := reqs.Requests[corev1.ResourceCPU]
 	qosClass := pod.GetQOSClass()
 	fraction := int(request.MilliValue())
+	prio := defaultPrio // ignored for fractional allocations
 
 	// easy cases: kube-system namespace, Burstable or BestEffort QoS class containers
 	preferReserved, explicitReservation := checkReservedCPUsAnnotations(container)
 	switch {
 	case container.PreserveCpuResources():
-		return 0, fraction, false, cpuPreserve
+		return 0, fraction, false, cpuPreserve, prio
 	case preferReserved == true:
-		return 0, fraction, false, cpuReserved
+		return 0, fraction, false, cpuReserved, prio
 	case checkReservedPoolNamespaces(namespace) && !explicitReservation:
-		return 0, fraction, false, cpuReserved
+		return 0, fraction, false, cpuReserved, prio
 	case qosClass == corev1.PodQOSBurstable:
-		return 0, fraction, false, cpuNormal
+		return 0, fraction, false, cpuNormal, prio
 	case qosClass == corev1.PodQOSBestEffort:
-		return 0, 0, false, cpuNormal
+		return 0, 0, false, cpuNormal, prio
 	}
 
 	// complex case: Guaranteed QoS class containers
@@ -460,39 +496,40 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in
 	fraction = fraction % 1000
 	preferIsolated, explicitIsolated := isolatedCPUsPreference(pod, container)
 	preferShared, explicitShared := sharedCPUsPreference(pod, container)
+	prio, _ = cpuPrioPreference(pod, container, defaultPrio) // only affects exclusive (full-CPU) allocations
 
 	switch {
 	// sub-core CPU request
 	case cores == 0:
-		return 0, fraction, false, cpuNormal
+		return 0, fraction, false, cpuNormal, prio
 	// 1 <= CPU request < 2
 	case cores < 2:
 		// fractional allocation, potentially mixed
 		if fraction > 0 {
 			if preferShared {
-				return 0, 1000*cores + fraction, false, cpuNormal
+				return 0, 1000*cores + fraction, false, cpuNormal, prio
 			}
-			return cores, fraction, preferIsolated, cpuNormal
+			return cores, fraction, preferIsolated, cpuNormal, prio
 		}
 		// non-fractional allocation
 		if preferShared && explicitShared {
-			return 0, 1000*cores + fraction, false, cpuNormal
+			return 0, 1000*cores + fraction, false, cpuNormal, prio
 		}
-		return cores, fraction, preferIsolated, cpuNormal
+		return cores, fraction, preferIsolated, cpuNormal, prio
 	// CPU request >= 2
 	default:
 		// fractional allocation, only mixed if explicitly annotated as unshared
 		if fraction > 0 {
 			if !preferShared && explicitShared {
-				return cores, fraction, preferIsolated && explicitIsolated, cpuNormal
+				return cores, fraction, preferIsolated && explicitIsolated, cpuNormal, prio
 			}
-			return 0, 1000*cores + fraction, false, cpuNormal
+			return 0, 1000*cores + fraction, false, cpuNormal, prio
 		}
 		// non-fractional allocation
 		if preferShared && explicitShared {
-			return 0, 1000 * cores, false, cpuNormal
+			return 0, 1000 * cores, false, cpuNormal, prio
 		}
-		return cores, fraction, preferIsolated && explicitIsolated, cpuNormal
+		return cores, fraction, preferIsolated && explicitIsolated, cpuNormal, prio
 	}
 }

diff --git a/cmd/plugins/topology-aware/policy/pools.go b/cmd/plugins/topology-aware/policy/pools.go
index e3975e522..d9dde1f8c 100644
--- a/cmd/plugins/topology-aware/policy/pools.go
+++ b/cmd/plugins/topology-aware/policy/pools.go
@@ -960,6 +960,38 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
 		}
 	}
 
+	// X. for LowPrio and HighPrio CPU preference, a node that can fulfill the preference wins
+	log.Debug(" - preferred CPU priority is %s", request.CPUPrio())
+	switch request.CPUPrio() {
+	case lowPrio:
+		lp1, lp2 := score1.LowPrio(), score2.LowPrio()
+		log.Debug(" - lp1 %d vs. lp2 %d", lp1, lp2)
+		switch {
+		case lp1 == lp2:
+			log.Debug(" - LowPrio CPU capacity is a TIE")
+		case lp1 >= 0 && lp2 < 0:
+			log.Debug(" => %s WINS based on LowPrio CPU capacity", node1.Name())
+			return true
+		case lp1 < 0 && lp2 >= 0:
+			log.Debug(" => %s WINS based on LowPrio CPU capacity", node2.Name())
+			return false
+		}
+
+	case highPrio:
+		hp1, hp2 := score1.HighPrio(), score2.HighPrio()
+		log.Debug(" - hp1 %d vs. hp2 %d", hp1, hp2)
+		switch {
+		case hp1 == hp2:
+			log.Debug(" - HighPrio CPU capacity is a TIE")
+		case hp1 >= 0 && hp2 < 0:
+			log.Debug(" => %s WINS based on HighPrio CPU capacity", node1.Name())
+			return true
+		case hp1 < 0 && hp2 >= 0:
+			log.Debug(" => %s WINS based on HighPrio CPU capacity", node2.Name())
+			return false
+		}
+	}
+
 	// 5) a lower node wins
 	if depth1 > depth2 {
 		log.Debug(" => %s WINS on depth", node1.Name())
@@ -1001,6 +1033,23 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
 		return id1 < id2
 	}
 
+	// X. for NormalPrio CPU preference, a node that can fulfill the preference wins
+	if request.CPUPrio() == normalPrio {
+		log.Debug(" - preferred CPU priority is %s", request.CPUPrio())
+		np1, np2 := score1.NormalPrio(), score2.NormalPrio()
+		log.Debug(" - np1 %d vs. np2 %d", np1, np2)
+		switch {
+		case np1 == np2:
+			log.Debug(" - NormalPrio CPU capacity is a TIE")
+		case np1 >= 0 && np2 < 0:
+			log.Debug(" => %s WINS based on NormalPrio CPU capacity", node1.Name())
+			return true
+		case np1 < 0 && np2 >= 0:
+			log.Debug(" => %s WINS based on NormalPrio CPU capacity", node2.Name())
+			return false
+		}
+	}
+
 	// 8) more sliceable shared capacity wins
 	if request.FullCPUs() > 0 && (shared1 > 0 || shared2 > 0) {
 		if shared1 > shared2 {

diff --git a/cmd/plugins/topology-aware/policy/resources.go b/cmd/plugins/topology-aware/policy/resources.go
index b2e4ef210..7af1e0277 100644
--- a/cmd/plugins/topology-aware/policy/resources.go
+++ b/cmd/plugins/topology-aware/policy/resources.go
@@ -19,6 +19,7 @@ import (
 	"strconv"
 	"time"
 
+	"github.com/containers/nri-plugins/pkg/sysfs"
 	"github.com/containers/nri-plugins/pkg/utils/cpuset"
 	v1 "k8s.io/api/core/v1"
 
@@ -28,6 +29,28 @@ import (
 	idset "github.com/intel/goresctrl/pkg/utils"
 )
 
+type (
+	cpuPrio = cpuallocator.CPUPriority
+)
+
+const (
+	highPrio   = cpuallocator.PriorityHigh
+	normalPrio = cpuallocator.PriorityNormal
+	lowPrio    = cpuallocator.PriorityLow
+	nonePrio   = cpuallocator.PriorityNone
+)
+
+var (
+	defaultPrio = nonePrio
+
+	cpuPrioByName = map[string]cpuPrio{
+		"high":   highPrio,
+		"normal": normalPrio,
+		"low":    lowPrio,
+		"none":   nonePrio,
+	}
+)
+
 // Supply represents available CPU and memory capacity of a node.
 type Supply interface {
 	// GetNode returns the node supplying this capacity.
@@ -95,6 +118,8 @@ type Request interface {
 	String() string
 	// CPUType returns the type of requested CPU.
 	CPUType() cpuClass
+	// CPUPrio returns the preferred priority of requested CPU.
+	CPUPrio() cpuPrio
 	// SetCPUType sets the type of requested CPU.
 	SetCPUType(cpuType cpuClass)
 	// FullCPUs returns the number of full CPUs requested.
@@ -195,6 +220,9 @@ type Score interface {
 	SharedCapacity() int
 	Colocated() int
 	HintScores() map[string]float64
+	LowPrio() int
+	HighPrio() int
+	NormalPrio() int
 
 	String() string
 }
@@ -223,6 +251,7 @@ type request struct {
 	fraction int      // amount of fractional CPU requested
 	isolate  bool     // prefer isolated exclusive CPUs
 	cpuType  cpuClass // preferred CPU type (normal, reserved)
+	prio     cpuPrio  // CPU priority preference, ignored for fractional requests
 
 	memReq uint64 // memory request
 	memLim uint64 // memory limit
@@ -257,13 +286,16 @@ var _ Grant = &grant{}
 
 // score implements our Score interface.
 type score struct {
-	supply    Supply             // CPU supply (node)
-	req       Request            // CPU request (container)
-	isolated  int                // remaining isolated CPUs
-	reserved  int                // remaining reserved CPUs
-	shared    int                // remaining shared capacity
-	colocated int                // number of colocated containers
-	hints     map[string]float64 // hint scores
+	supply     Supply             // CPU supply (node)
+	req        Request            // CPU request (container)
+	isolated   int                // remaining isolated CPUs
+	reserved   int                // remaining reserved CPUs
+	shared     int                // remaining shared capacity
+	lowPrio    int                // remaining low-priority CPUs
+	highPrio   int                // remaining high-priority CPUs
+	normalPrio int                // remaining normal-priority CPUs
+	colocated  int                // number of colocated containers
+	hints      map[string]float64 // hint scores
 }
 
 var _ Score = &score{}
@@ -575,7 +607,7 @@ func (cs *supply) AllocateCPU(r Request) (Grant, error) {
 	// allocate isolated exclusive CPUs or slice them off the sharable set
 	switch {
 	case full > 0 && cs.isolated.Size() >= full && cr.isolate:
-		exclusive, err = cs.takeCPUs(&cs.isolated, nil, full)
+		exclusive, err = cs.takeCPUs(&cs.isolated, nil, full, cr.CPUPrio())
 		if err != nil {
 			return nil, policyError("internal error: "+
 				"%s: can't take %d exclusive isolated CPUs from %s: %v",
@@ -583,7 +615,7 @@
 
 	case full > 0 && cs.AllocatableSharedCPU() > 1000*full:
-		exclusive, err = cs.takeCPUs(&cs.sharable, nil, full)
+		exclusive, err = cs.takeCPUs(&cs.sharable, nil, full, cr.CPUPrio())
 		if err != nil {
 			return nil, policyError("internal error: "+
 				"%s: can't take %d exclusive CPUs from %s: %v",
@@ -764,8 +796,8 @@ func (cs *supply) ReserveMemory(g Grant) error {
 }
 
 // takeCPUs moves up to cnt CPUs from one CPU set to another.
-func (cs *supply) takeCPUs(from, to *cpuset.CPUSet, cnt int) (cpuset.CPUSet, error) {
-	cset, err := cs.node.Policy().cpuAllocator.AllocateCpus(from, cnt, cpuallocator.PriorityHigh)
+func (cs *supply) takeCPUs(from, to *cpuset.CPUSet, cnt int, prio cpuPrio) (cpuset.CPUSet, error) {
+	cset, err := cs.node.Policy().cpuAllocator.AllocateCpus(from, cnt, prio)
 	if err != nil {
 		return cset, err
 	}
@@ -942,12 +974,12 @@ func (cs *supply) DumpMemoryState(prefix string) {
 // newRequest creates a new request for the given container.
 func newRequest(container cache.Container) Request {
 	pod, _ := container.GetPod()
-	full, fraction, isolate, cpuType := cpuAllocationPreferences(pod, container)
+	full, fraction, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
 	req, lim, mtype := memoryAllocationPreference(pod, container)
 	coldStart := time.Duration(0)
 
-	log.Debug("%s: CPU preferences: cpuType=%s, full=%v, fraction=%v, isolate=%v",
-		container.PrettyName(), cpuType, full, fraction, isolate)
+	log.Debug("%s: CPU preferences: cpuType=%s, full=%v, fraction=%v, isolate=%v, prio=%v",
+		container.PrettyName(), cpuType, full, fraction, isolate, prio)
 
 	if mtype == memoryUnspec {
 		mtype = defaultMemoryType
@@ -984,6 +1016,7 @@
 		memLim:    lim,
 		memType:   mtype,
 		coldStart: coldStart,
+		prio:      prio,
 	}
 }
 
@@ -998,19 +1031,19 @@ func (cr *request) String() string {
 	isolated := map[bool]string{false: "", true: "isolated "}[cr.isolate]
 	switch {
 	case cr.full == 0 && cr.fraction == 0:
-		return fmt.Sprintf("<CPU request "+cr.container.PrettyName()+": ->") + mem
+		return fmt.Sprintf("<%s CPU request "+cr.container.PrettyName()+": ->", cr.prio) + mem
 	case cr.full > 0 && cr.fraction > 0:
-		return fmt.Sprintf("<CPU request "+cr.container.PrettyName()+": "+
-			"%sexclusive: %d, shared: %d>", isolated, cr.full, cr.fraction) + mem
+		return fmt.Sprintf("<%s CPU request "+cr.container.PrettyName()+": "+
+			"%sexclusive: %d, shared: %d>", cr.prio, isolated, cr.full, cr.fraction) + mem
 	case cr.full > 0:
-		return fmt.Sprintf("<CPU request "+
-			cr.container.PrettyName()+": %sexclusive: %d>", isolated, cr.full) + mem
+		return fmt.Sprintf("<%s CPU request "+
+			cr.container.PrettyName()+": %sexclusive: %d>", cr.prio, isolated, cr.full) + mem
 	default:
-		return fmt.Sprintf("<CPU request "+
-			cr.container.PrettyName()+": shared: %d>", cr.fraction) + mem
+		return fmt.Sprintf("<%s CPU request "+
+			cr.container.PrettyName()+": shared: %d>", cr.prio, cr.fraction) + mem
 	}
 }
 
@@ -1019,6 +1052,10 @@ func (cr *request) CPUType() cpuClass {
 	return cr.cpuType
 }
 
+func (cr *request) CPUPrio() cpuPrio {
+	return cr.prio
+}
+
 // SetCPUType sets the requested type of CPU for the grant.
 func (cr *request) SetCPUType(cpuType cpuClass) {
 	cr.cpuType = cpuType
@@ -1105,6 +1142,31 @@ func (cs *supply) GetScore(req Request) Score {
 
 		// calculate fractional capacity
 		score.shared -= part
+
+		ecores := cs.GetNode().System().CoreKindCPUs(sysfs.EfficientCore)
+		lpCPUs := ecores
+		if ecores.Size() == 0 {
+			lpCPUs = cs.GetNode().Policy().cpuAllocator.GetCPUPriorities()[lowPrio]
+		}
+		lpCPUs = lpCPUs.Intersection(cs.SharableCPUs())
+		lpCnt := lpCPUs.Size()
+		score.lowPrio = lpCnt*1000 - (1000*full + part)
+
+		pcores := cs.GetNode().System().CoreKindCPUs(sysfs.PerformanceCore)
+		hpCPUs := pcores
+		if pcores.Size() == 0 {
+			hpCPUs = cs.GetNode().Policy().cpuAllocator.GetCPUPriorities()[highPrio]
+		}
+		hpCPUs = hpCPUs.Intersection(cs.SharableCPUs())
+		hpCnt := hpCPUs.Size()
+		score.highPrio = hpCnt*1000 - (1000*full + part)
+
+		// no core-kind analog for normal priority; use the allocator's
+		// normal-priority CPUs directly
+		npCPUs := cs.GetNode().Policy().cpuAllocator.GetCPUPriorities()[normalPrio]
+		npCPUs = npCPUs.Intersection(cs.SharableCPUs())
+		npCnt := npCPUs.Size()
+		score.normalPrio = npCnt*1000 - (1000*full + part)
 	}
 
 	// calculate colocation score
@@ -1204,6 +1266,18 @@ func (score *score) HintScores() map[string]float64 {
 	return score.hints
 }
 
+func (score *score) LowPrio() int {
+	return score.lowPrio
+}
+
+func (score *score) HighPrio() int {
+	return score.highPrio
+}
+
+func (score *score) NormalPrio() int {
+	return score.normalPrio
+}
+
 func (score *score) String() string {
 	return fmt.Sprintf("<node %s: isolated:%d, reserved:%d, shared:%d, colocated:%d, hints: %v>",
 		score.supply.GetNode().Name(), score.isolated, score.reserved, score.shared, score.colocated, score.hints)

diff --git a/cmd/plugins/topology-aware/policy/topology-aware-policy.go b/cmd/plugins/topology-aware/policy/topology-aware-policy.go
index e25984c2f..efc829cc3 100644
--- a/cmd/plugins/topology-aware/policy/topology-aware-policy.go
+++ b/cmd/plugins/topology-aware/policy/topology-aware-policy.go
@@ -98,6 +98,7 @@ func (p *policy) Setup(opts *policyapi.BackendOptions) error {
 	p.cpuAllocator = cpuallocator.NewCPUAllocator(opts.System)
 
 	opt = cfg
+	defaultPrio = cfg.DefaultCPUPriority.Value()
 
 	if err := p.initialize(); err != nil {
 		return policyError("failed to initialize %s policy: %w", PolicyName, err)
@@ -105,6 +106,8 @@
 
 	p.registerImplicitAffinities()
 
+	log.Info("default CPU priority is %s", defaultPrio)
+
 	return nil
 }
 
@@ -425,6 +428,7 @@ func (p *policy) Reconfigure(newCfg interface{}) error {
 
 	opt = cfg
 	p.cfg = cfg
+	defaultPrio = cfg.DefaultCPUPriority.Value()
 
 	if err := p.initialize(); err != nil {
 		*p = savedPolicy
@@ -435,6 +439,7 @@
 		if err := grant.RefetchNodes(); err != nil {
 			*p = savedPolicy
 			opt = p.cfg
+			defaultPrio = p.cfg.DefaultCPUPriority.Value()
 			return policyError("failed to reconfigure: %v", err)
 		}
 	}
@@ -523,7 +528,7 @@
 		// Use CpuAllocator to pick reserved CPUs among
 		// allowed ones. Because using those CPUs is allowed,
 		// they remain (they are put back) in the allowed set.
-		cset, err := p.cpuAllocator.AllocateCpus(&p.allowed, p.reserveCnt, cpuallocator.PriorityNormal)
+		cset, err := p.cpuAllocator.AllocateCpus(&p.allowed, p.reserveCnt, normalPrio)
 		p.allowed = p.allowed.Union(cset)
 		if err != nil {
 			log.Fatal("cannot reserve %dm CPUs for ReservedResources from AvailableResources: %s", qty.MilliValue(), err)

diff --git a/config/crd/bases/config.nri_topologyawarepolicies.yaml b/config/crd/bases/config.nri_topologyawarepolicies.yaml
index af7faacee..802b4da15 100644
--- a/config/crd/bases/config.nri_topologyawarepolicies.yaml
+++ b/config/crd/bases/config.nri_topologyawarepolicies.yaml
@@ -94,6 +94,19 @@ spec:
                 - classes
                 type: object
             type: object
+          defaultCPUPriority:
+            default: none
+            description: |-
+              DefaultCPUPriority (high, normal, low, none) is the preferred CPU
+              priority for allocated CPUs when a container has not been annotated
+              with any other CPU preference.
+              Note: currently this option only affects exclusive CPU allocations.
+            enum:
+            - high
+            - normal
+            - low
+            - none
+            type: string
           instrumentation:
             description: Config provides runtime configuration for instrumentation.
             properties:

diff --git a/deployment/helm/topology-aware/crds/config.nri_topologyawarepolicies.yaml b/deployment/helm/topology-aware/crds/config.nri_topologyawarepolicies.yaml
index af7faacee..802b4da15 100644
--- a/deployment/helm/topology-aware/crds/config.nri_topologyawarepolicies.yaml
+++ b/deployment/helm/topology-aware/crds/config.nri_topologyawarepolicies.yaml
@@ -94,6 +94,19 @@ spec:
                 - classes
                 type: object
             type: object
+          defaultCPUPriority:
+            default: none
+            description: |-
+              DefaultCPUPriority (high, normal, low, none) is the preferred CPU
+              priority for allocated CPUs when a container has not been annotated
+              with any other CPU preference.
+              Note: currently this option only affects exclusive CPU allocations.
+            enum:
+            - high
+            - normal
+            - low
+            - none
+            type: string
           instrumentation:
             description: Config provides runtime configuration for instrumentation.
             properties:

diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware/config.go b/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware/config.go
index 5adfca087..d1e537215 100644
--- a/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware/config.go
+++ b/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware/config.go
@@ -15,7 +15,10 @@
 package topologyaware
 
 import (
+	"strings"
+
 	policy "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy"
+	"github.com/containers/nri-plugins/pkg/cpuallocator"
 )
 
 type (
@@ -33,6 +36,27 @@ const (
 	AmountCPUSet = policy.AmountCPUSet
 )
 
+type CPUPriority string
+
+const (
+	PriorityHigh   CPUPriority = "high"
+	PriorityNormal CPUPriority = "normal"
+	PriorityLow    CPUPriority = "low"
+	PriorityNone   CPUPriority = "none"
+)
+
+func (p CPUPriority) Value() cpuallocator.CPUPriority {
+	switch strings.ToLower(string(p)) {
+	case string(PriorityHigh):
+		return cpuallocator.PriorityHigh
+	case string(PriorityNormal):
+		return cpuallocator.PriorityNormal
+	case string(PriorityLow):
+		return cpuallocator.PriorityLow
+	}
+	return cpuallocator.PriorityNone
+}
+
 // +k8s:deepcopy-gen=true
 // +optional
 type Config struct {
@@ -77,4 +101,11 @@ type Config struct {
 	// of it.
 	// +kubebuilder:validation:Required
 	ReservedResources Constraints `json:"reservedResources"`
+	// DefaultCPUPriority (high, normal, low, none) is the preferred CPU
+	// priority for allocated CPUs when a container has not been annotated
+	// with any other CPU preference.
+	// Note: currently this option only affects exclusive CPU allocations.
+	// +kubebuilder:validation:Enum=high;normal;low;none
+	// +kubebuilder:default=none
+	DefaultCPUPriority CPUPriority `json:"defaultCPUPriority,omitempty"`
 }
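
Usage sketch (reviewer note, not part of the patch; illustrative only). The
examples below assume that the effective annotation key expands to
prefer-cpu-priority.resource-policy.nri.io (the patch builds it from
keyCpuPriorityPreference and kubernetes.ResmgrKeyNamespace, whose value is
not visible in this diff), that the container-scoped /container.<name>
suffix handled by GetEffectiveAnnotation applies here as for the other
preference annotations, and that configuration is delivered via the
TopologyAwarePolicy custom resource. With those assumptions, "E-cores by
default, P-cores for annotated containers" could look roughly like:

    # TopologyAwarePolicy configuration: prefer low-priority (e.g. E-core)
    # CPUs for exclusive allocations by default. reservedResources value
    # is a placeholder.
    apiVersion: config.nri/v1alpha1
    kind: TopologyAwarePolicy
    metadata:
      name: default
      namespace: kube-system
    spec:
      reservedResources:
        cpu: 750m
      defaultCPUPriority: low
    ---
    # Pod overriding the default for one container; the container name
    # ("critical") and the annotation form are assumptions.
    apiVersion: v1
    kind: Pod
    metadata:
      name: pcore-workload
      annotations:
        prefer-cpu-priority.resource-policy.nri.io/container.critical: high
    spec:
      containers:
      - name: critical
        resources:
          requests:
            cpu: "2"
          limits:
            cpu: "2"

Since the priority preference only steers fully or partially exclusive
allocations, the override above needs a Guaranteed-QoS, whole-CPU request
like the one shown in order to take effect.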