Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

slo-controller: refactor codes for reading #1973

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added docs/images/node-resource-model.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
25 changes: 19 additions & 6 deletions pkg/slo-controller/noderesource/plugins/batchresource/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,18 @@ func (p *Plugin) calculate(strategy *configuration.ColocationStrategy, node *cor
}, nil
}

// In order to support the colocation requirements of different enterprise environments, a configurable colocation strategy is provided.
// The resource view from the node perspective is as follows:
//
// https://github.com/koordinator-sh/koordinator/blob/main/docs/images/node-resource-model.png
//
// Typical colocation scenario:
// 1. Default policy: the CPU and memory that can be colocated are automatically calculated based on the load level of the node.
// 2. Default policy on CPU, with memory configured not to be overcommitted. This reduces the probability of batch pods
// being killed due to high memory water levels (i.e., it lowers the kill rate).
//
// In each scenario, users can also adjust the resource water level configuration according to their own needs and control the deployment
// density of batch pods.
func (p *Plugin) calculateOnNode(strategy *configuration.ColocationStrategy, node *corev1.Node, podList *corev1.PodList,
resourceMetrics *framework.ResourceMetrics) (corev1.ResourceList, string, string) {
// compute the requests and usages according to the pods' priority classes.
Expand Down Expand Up @@ -293,7 +305,7 @@ func (p *Plugin) calculateOnNode(strategy *configuration.ColocationStrategy, nod
"cpu", podsDanglingUsed.Cpu().String(), "memory", podsDanglingUsed.Memory().String())

nodeCapacity := getNodeCapacity(node)
nodeReservation := getNodeReservation(strategy, nodeCapacity)
nodeSafetyMargin := getNodeSafetyMargin(strategy, nodeCapacity)

systemUsed := getResourceListForCPUAndMemory(nodeMetric.Status.NodeMetric.SystemUsage.ResourceList)
// resource usage of host applications with prod priority will be counted as host system usage since they consume the
Expand All @@ -303,9 +315,10 @@ func (p *Plugin) calculateOnNode(strategy *configuration.ColocationStrategy, nod
// System.Reserved = max(Node.Anno.Reserved, Node.Kubelet.Reserved)
nodeAnnoReserved := util.GetNodeReservationFromAnnotation(node.Annotations)
nodeKubeletReserved := util.GetNodeReservationFromKubelet(node)
systemReserved := quotav1.Max(nodeKubeletReserved, nodeAnnoReserved)
// FIXME: resource reservation taking max is rather confusing.
nodeReserved := quotav1.Max(nodeKubeletReserved, nodeAnnoReserved)

batchAllocatable, cpuMsg, memMsg := calculateBatchResourceByPolicy(strategy, nodeCapacity, nodeReservation, systemReserved,
batchAllocatable, cpuMsg, memMsg := calculateBatchResourceByPolicy(strategy, nodeCapacity, nodeSafetyMargin, nodeReserved,
systemUsed, podsHPRequest, podsHPUsed, podsHPMaxUsedReq)
metrics.RecordNodeExtendedResourceAllocatableInternal(node, string(extension.BatchCPU), metrics.UnitInteger, float64(batchAllocatable.Cpu().MilliValue())/1000)
metrics.RecordNodeExtendedResourceAllocatableInternal(node, string(extension.BatchMemory), metrics.UnitByte, float64(batchAllocatable.Memory().Value()))
Expand Down Expand Up @@ -366,7 +379,7 @@ func (p *Plugin) calculateOnNUMALevel(strategy *configuration.ColocationStrategy
systemUsed = quotav1.Add(systemUsed, hostAppHPUsed)
nodeAnnoReserved := util.GetNodeReservationFromAnnotation(node.Annotations)
nodeKubeletReserved := util.GetNodeReservationFromKubelet(node)
systemReserved := quotav1.Max(nodeKubeletReserved, nodeAnnoReserved)
nodeReserved := quotav1.Max(nodeKubeletReserved, nodeAnnoReserved)

for i, zone := range nrt.Zones {
zoneIdxMap[i] = zone.Name
Expand All @@ -380,9 +393,9 @@ func (p *Plugin) calculateOnNUMALevel(strategy *configuration.ColocationStrategy
nodeZoneAllocatable[i][corev1.ResourceName(resourceInfo.Name)] = resourceInfo.Allocatable.DeepCopy()
}
}
nodeZoneReserve[i] = getNodeReservation(strategy, nodeZoneAllocatable[i])
nodeZoneReserve[i] = getNodeSafetyMargin(strategy, nodeZoneAllocatable[i])
systemZoneUsed[i] = divideResourceList(systemUsed, float64(zoneNum))
systemZoneReserved[i] = divideResourceList(systemReserved, float64(zoneNum))
systemZoneReserved[i] = divideResourceList(nodeReserved, float64(zoneNum))
}
podMetricMap := make(map[string]*slov1alpha1.PodMetricInfo)
podMetricUnknownMap := make(map[string]*slov1alpha1.PodMetricInfo)
Expand Down
Loading
Loading