Skip to content

Commit

Permalink
WIP: topology-aware: support for CPU allocator priorities.
Browse files Browse the repository at this point in the history
Add support for configurable default and annotated per-container
CPU priority preferences. These determine the preferred priority
for CPUs when doing fully or partially exclusive CPU allocation.
Priorities are calculated for such allocations and passed on to
the CPU allocator which then tries to fulfill these preferences.

It should now be possible to configure the policy to allocate
(exclusive) E-cores by default and P-cores to containers which
are annotated so, or to do it the other way around.

Signed-off-by: Krisztian Litkey <krisztian.litkey@intel.com>
  • Loading branch information
klihub committed Mar 14, 2024
1 parent ecb4ced commit 7a8e9fe
Show file tree
Hide file tree
Showing 7 changed files with 258 additions and 37 deletions.
66 changes: 51 additions & 15 deletions cmd/plugins/topology-aware/policy/pod-preferences.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ const (
keyColdStartPreference = "cold-start"
// annotation key for reserved pools
keyReservedCPUsPreference = "prefer-reserved-cpus"
// annotation key for CPU Priority preference
keyCpuPriorityPreference = "prefer-cpu-priority"

// effective annotation key for isolated CPU preference
preferIsolatedCPUsKey = keyIsolationPreference + "." + kubernetes.ResmgrKeyNamespace
Expand All @@ -53,6 +55,8 @@ const (
preferColdStartKey = keyColdStartPreference + "." + kubernetes.ResmgrKeyNamespace
// annotation key for reserved pools
preferReservedCPUsKey = keyReservedCPUsPreference + "." + kubernetes.ResmgrKeyNamespace
// effective annotation key for CPU priority preference
preferCpuPriorityKey = keyCpuPriorityPreference + "." + kubernetes.ResmgrKeyNamespace
)

// cpuClass is a type of CPU to allocate
Expand Down Expand Up @@ -153,6 +157,36 @@ func sharedCPUsPreference(pod cache.Pod, container cache.Container) (bool, bool)
return preference, true
}

// cpuPrioPreference returns the CPU priority preference for the given container
// and whether the container was explicitly annotated with this setting.
func cpuPrioPreference(pod cache.Pod, container cache.Container, fallback cpuPrio) (cpuPrio, bool) {
	value, ok := pod.GetEffectiveAnnotation(preferCpuPriorityKey, container.GetName())
	if !ok {
		// No annotation present; fall back to the caller-supplied default.
		log.Debug("%s: implicit CPU priority preference %q", container.PrettyName(), fallback)
		return fallback, false
	}

	// The literal value "default" explicitly selects the built-in default priority.
	if value == "default" {
		log.Debug("%s: explicit CPU priority preference %q", container.PrettyName(), defaultPrio)
		return defaultPrio, true
	}

	if prio, found := cpuPrioByName[value]; found {
		log.Debug("%s: explicit CPU priority preference %q", container.PrettyName(), prio)
		return prio, true
	}

	// Unrecognized value: report it, then behave exactly as if unannotated.
	log.Error("%s: invalid CPU priority preference %q", container.PrettyName(), value)
	log.Debug("%s: implicit CPU priority preference %q", container.PrettyName(), fallback)
	return fallback, false
}

// memoryTypePreference returns what type of memory should be allocated for the container.
//
// If the effective annotations are not found, this function falls back to
Expand Down Expand Up @@ -370,7 +404,7 @@ func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) {
// 2. fraction: amount of fractional CPU in milli-CPU
// 3. isolate: (bool) whether to prefer isolated full CPUs
// 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass) {
func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass, cpuPrio) {
//
// CPU allocation preferences for a container consist of
//
Expand Down Expand Up @@ -439,60 +473,62 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in
request := reqs.Requests[corev1.ResourceCPU]
qosClass := pod.GetQOSClass()
fraction := int(request.MilliValue())
prio := defaultPrio // ignored for fractional allocations

// easy cases: kube-system namespace, Burstable or BestEffort QoS class containers
preferReserved, explicitReservation := checkReservedCPUsAnnotations(container)
switch {
case container.PreserveCpuResources():
return 0, fraction, false, cpuPreserve
return 0, fraction, false, cpuPreserve, prio
case preferReserved == true:
return 0, fraction, false, cpuReserved
return 0, fraction, false, cpuReserved, prio
case checkReservedPoolNamespaces(namespace) && !explicitReservation:
return 0, fraction, false, cpuReserved
return 0, fraction, false, cpuReserved, prio
case qosClass == corev1.PodQOSBurstable:
return 0, fraction, false, cpuNormal
return 0, fraction, false, cpuNormal, prio
case qosClass == corev1.PodQOSBestEffort:
return 0, 0, false, cpuNormal
return 0, 0, false, cpuNormal, prio
}

// complex case: Guaranteed QoS class containers
cores := fraction / 1000
fraction = fraction % 1000
preferIsolated, explicitIsolated := isolatedCPUsPreference(pod, container)
preferShared, explicitShared := sharedCPUsPreference(pod, container)
prio, _ = cpuPrioPreference(pod, container, defaultPrio) // ignored for fractional allocations

switch {
// sub-core CPU request
case cores == 0:
return 0, fraction, false, cpuNormal
return 0, fraction, false, cpuNormal, prio
// 1 <= CPU request < 2
case cores < 2:
// fractional allocation, potentially mixed
if fraction > 0 {
if preferShared {
return 0, 1000*cores + fraction, false, cpuNormal
return 0, 1000*cores + fraction, false, cpuNormal, prio
}
return cores, fraction, preferIsolated, cpuNormal
return cores, fraction, preferIsolated, cpuNormal, prio
}
// non-fractional allocation
if preferShared && explicitShared {
return 0, 1000*cores + fraction, false, cpuNormal
return 0, 1000*cores + fraction, false, cpuNormal, prio
}
return cores, fraction, preferIsolated, cpuNormal
return cores, fraction, preferIsolated, cpuNormal, prio
// CPU request >= 2
default:
// fractional allocation, only mixed if explicitly annotated as unshared
if fraction > 0 {
if !preferShared && explicitShared {
return cores, fraction, preferIsolated && explicitIsolated, cpuNormal
return cores, fraction, preferIsolated && explicitIsolated, cpuNormal, prio
}
return 0, 1000*cores + fraction, false, cpuNormal
return 0, 1000*cores + fraction, false, cpuNormal, prio
}
// non-fractional allocation
if preferShared && explicitShared {
return 0, 1000 * cores, false, cpuNormal
return 0, 1000 * cores, false, cpuNormal, prio
}
return cores, fraction, preferIsolated && explicitIsolated, cpuNormal
return cores, fraction, preferIsolated && explicitIsolated, cpuNormal, prio
}
}

Expand Down
47 changes: 47 additions & 0 deletions cmd/plugins/topology-aware/policy/pools.go
Original file line number Diff line number Diff line change
Expand Up @@ -960,6 +960,37 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
}
}

// X. for LowPrio and HighPrio CPU preference, the only fulfilling node wins
log.Debug(" - preferred CPU priority is %s", request.CPUPrio())
switch request.CPUPrio() {
case lowPrio:
	lp1, lp2 := score1.LowPrio(), score2.LowPrio()
	log.Debug(" - lp1 %d vs. lp2 %d", lp1, lp2)
	switch {
	case lp1 == lp2:
		log.Debug(" - LowPrio CPU capacity is a TIE")
	case lp1 >= 0 && lp2 < 0:
		log.Debug(" => %s WINS based on LowPrio CPU capacity", node1.Name())
		return true
	case lp1 < 0 && lp2 >= 0:
		// Returning false means node2 wins, so log node2 (was node1).
		log.Debug(" => %s WINS based on LowPrio CPU capacity", node2.Name())
		return false
	}

case highPrio:
	hp1, hp2 := score1.HighPrio(), score2.HighPrio()
	// Log the compared values, mirroring the lowPrio branch above.
	log.Debug(" - hp1 %d vs. hp2 %d", hp1, hp2)
	switch {
	case hp1 == hp2:
		log.Debug(" - HighPrio CPU capacity is a TIE")
	case hp1 >= 0 && hp2 < 0:
		log.Debug(" => %s WINS based on HighPrio CPU capacity", node1.Name())
		return true
	case hp1 < 0 && hp2 >= 0:
		// Returning false means node2 wins, so log node2 (was node1).
		log.Debug(" => %s WINS based on HighPrio CPU capacity", node2.Name())
		return false
	}
}

// 5) a lower node wins
if depth1 > depth2 {
log.Debug(" => %s WINS on depth", node1.Name())
Expand Down Expand Up @@ -1001,6 +1032,22 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
return id1 < id2
}

// X. for NormalPrio CPU preference, the only fulfilling node wins
log.Debug(" - preferred CPU priority is %s", request.CPUPrio())
if request.CPUPrio() == normalPrio {
	np1, np2 := score1.NormalPrio(), score2.NormalPrio()
	switch {
	case np1 == np2:
		log.Debug(" - NormalPrio CPU capacity is a TIE")
	case np1 >= 0 && np2 < 0:
		log.Debug(" => %s WINS based on NormalPrio CPU capacity", node1.Name())
		return true
	case np1 < 0 && np2 >= 0:
		// Returning false means node2 wins, so log node2 (was node1);
		// message text also made consistent with the winning branch above.
		log.Debug(" => %s WINS based on NormalPrio CPU capacity", node2.Name())
		return false
	}
}

// 8) more slicable shared capacity wins
if request.FullCPUs() > 0 && (shared1 > 0 || shared2 > 0) {
if shared1 > shared2 {
Expand Down
Loading

0 comments on commit 7a8e9fe

Please sign in to comment.