Skip to content

Commit

Permalink
slo-controller: refactor codes for reading (koordinator-sh#1973)
Browse files Browse the repository at this point in the history
Signed-off-by: Fansong Zeng <fanster.z@gmail.com>
Signed-off-by: george <xiangzhihua@gmail.com>
  • Loading branch information
hormes authored and georgexiang committed Apr 15, 2024
1 parent a2157ee commit c415bfb
Show file tree
Hide file tree
Showing 6 changed files with 119 additions and 95 deletions.
Binary file added docs/images/node-resource-model.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 19 additions & 6 deletions pkg/slo-controller/noderesource/plugins/batchresource/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,18 @@ func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *cor
}, nil
}

// In order to support the colocation requirements of different enterprise environments, a configurable colocation strategy is provided.
// The resource view from the node perspective is as follows:
//
// https://github.com/koordinator-sh/koordinator/blob/main/docs/images/node-resource-model.png
//
// Typical colocation scenarios:
// 1. Default policy: the CPU and memory available for colocation are automatically calculated based on the load level of the node.
// 2. Default policy on CPU, with memory configured not to be overcommitted. This reduces the probability of batch pods
// being killed due to high memory water levels (i.e. it reduces the kill rate).
//
// In each scenario, users can also adjust the resource water level configuration according to their own needs to control the deployment
// density of batch pods.
func (p *Plugin) calculateOnNode(strategy *configuration.ColocationStrategy, node *corev1.Node, podList *corev1.PodList,
resourceMetrics *framework.ResourceMetrics) (corev1.ResourceList, string, string) {
// compute the requests and usages according to the pods' priority classes.
Expand Down Expand Up @@ -293,7 +305,7 @@ func (p *Plugin) calculateOnNode(strategy *configuration.ColocationStrategy, nod
"cpu", podsDanglingUsed.Cpu().String(), "memory", podsDanglingUsed.Memory().String())

nodeCapacity := getNodeCapacity(node)
nodeReservation := getNodeReservation(strategy, nodeCapacity)
nodeSafetyMargin := getNodeSafetyMargin(strategy, nodeCapacity)

systemUsed := getResourceListForCPUAndMemory(nodeMetric.Status.NodeMetric.SystemUsage.ResourceList)
// resource usage of host applications with prod priority will be counted as host system usage since they consume the
Expand All @@ -303,9 +315,10 @@ func (p *Plugin) calculateOnNode(strategy *configuration.ColocationStrategy, nod
// System.Reserved = max(Node.Anno.Reserved, Node.Kubelet.Reserved)
nodeAnnoReserved := util.GetNodeReservationFromAnnotation(node.Annotations)
nodeKubeletReserved := util.GetNodeReservationFromKubelet(node)
systemReserved := quotav1.Max(nodeKubeletReserved, nodeAnnoReserved)
// FIXME: resource reservation taking max is rather confusing.
nodeReserved := quotav1.Max(nodeKubeletReserved, nodeAnnoReserved)

batchAllocatable, cpuMsg, memMsg := calculateBatchResourceByPolicy(strategy, nodeCapacity, nodeReservation, systemReserved,
batchAllocatable, cpuMsg, memMsg := calculateBatchResourceByPolicy(strategy, nodeCapacity, nodeSafetyMargin, nodeReserved,
systemUsed, podsHPRequest, podsHPUsed, podsHPMaxUsedReq)
metrics.RecordNodeExtendedResourceAllocatableInternal(node, string(extension.BatchCPU), metrics.UnitInteger, float64(batchAllocatable.Cpu().MilliValue())/1000)
metrics.RecordNodeExtendedResourceAllocatableInternal(node, string(extension.BatchMemory), metrics.UnitByte, float64(batchAllocatable.Memory().Value()))
Expand Down Expand Up @@ -366,7 +379,7 @@ func (p *Plugin) calculateOnNUMALevel(strategy *configuration.ColocationStrategy
systemUsed = quotav1.Add(systemUsed, hostAppHPUsed)
nodeAnnoReserved := util.GetNodeReservationFromAnnotation(node.Annotations)
nodeKubeletReserved := util.GetNodeReservationFromKubelet(node)
systemReserved := quotav1.Max(nodeKubeletReserved, nodeAnnoReserved)
nodeReserved := quotav1.Max(nodeKubeletReserved, nodeAnnoReserved)

for i, zone := range nrt.Zones {
zoneIdxMap[i] = zone.Name
Expand All @@ -380,9 +393,9 @@ func (p *Plugin) calculateOnNUMALevel(strategy *configuration.ColocationStrategy
nodeZoneAllocatable[i][corev1.ResourceName(resourceInfo.Name)] = resourceInfo.Allocatable.DeepCopy()
}
}
nodeZoneReserve[i] = getNodeReservation(strategy, nodeZoneAllocatable[i])
nodeZoneReserve[i] = getNodeSafetyMargin(strategy, nodeZoneAllocatable[i])
systemZoneUsed[i] = divideResourceList(systemUsed, float64(zoneNum))
systemZoneReserved[i] = divideResourceList(systemReserved, float64(zoneNum))
systemZoneReserved[i] = divideResourceList(nodeReserved, float64(zoneNum))
}
podMetricMap := make(map[string]*slov1alpha1.PodMetricInfo)
podMetricUnknownMap := make(map[string]*slov1alpha1.PodMetricInfo)
Expand Down
Loading

0 comments on commit c415bfb

Please sign in to comment.