From 3e53d016e96c963905ecd116581bb3264c93dfa1 Mon Sep 17 00:00:00 2001 From: xilinxing 00347057 Date: Thu, 12 May 2022 10:04:32 +0800 Subject: [PATCH 01/18] bugfix: api-server deny empty admission response with PatchType set Signed-off-by: xilinxing 00347057 --- pkg/webhooks/admission/jobs/mutate/mutate_job.go | 6 ++++-- .../admission/podgroups/mutate/mutate_podgroup.go | 13 ++++++++----- pkg/webhooks/admission/pods/mutate/mutate_pod.go | 7 +++++-- .../admission/queues/mutate/mutate_queue.go | 13 ++++++++----- 4 files changed, 25 insertions(+), 14 deletions(-) diff --git a/pkg/webhooks/admission/jobs/mutate/mutate_job.go b/pkg/webhooks/admission/jobs/mutate/mutate_job.go index f96cc8967f..374d729d5b 100644 --- a/pkg/webhooks/admission/jobs/mutate/mutate_job.go +++ b/pkg/webhooks/admission/jobs/mutate/mutate_job.go @@ -98,8 +98,10 @@ func Jobs(ar admissionv1.AdmissionReview) *admissionv1.AdmissionResponse { Allowed: true, Patch: patchBytes, } - pt := admissionv1.PatchTypeJSONPatch - reviewResponse.PatchType = &pt + if len(patchBytes) > 0 { + pt := admissionv1.PatchTypeJSONPatch + reviewResponse.PatchType = &pt + } return &reviewResponse } diff --git a/pkg/webhooks/admission/podgroups/mutate/mutate_podgroup.go b/pkg/webhooks/admission/podgroups/mutate/mutate_podgroup.go index c65acc8b86..c7b8864cde 100644 --- a/pkg/webhooks/admission/podgroups/mutate/mutate_podgroup.go +++ b/pkg/webhooks/admission/podgroups/mutate/mutate_podgroup.go @@ -87,12 +87,15 @@ func PodGroups(ar admissionv1.AdmissionReview) *admissionv1.AdmissionResponse { } } - pt := admissionv1.PatchTypeJSONPatch - return &admissionv1.AdmissionResponse{ - Allowed: true, - Patch: patchBytes, - PatchType: &pt, + reviewResponse := admissionv1.AdmissionResponse{ + Allowed: true, + Patch: patchBytes, } + if len(patchBytes) > 0 { + pt := admissionv1.PatchTypeJSONPatch + reviewResponse.PatchType = &pt + } + return &reviewResponse } func createPodGroupPatch(podgroup *schedulingv1beta1.PodGroup) ([]byte, error) { diff --git a/pkg/webhooks/admission/pods/mutate/mutate_pod.go b/pkg/webhooks/admission/pods/mutate/mutate_pod.go index cd1d084280..309390ccf5 100644 --- a/pkg/webhooks/admission/pods/mutate/mutate_pod.go +++ b/pkg/webhooks/admission/pods/mutate/mutate_pod.go @@ -19,6 +19,7 @@ package mutate import ( "encoding/json" "fmt" + admissionv1 "k8s.io/api/admission/v1" whv1 "k8s.io/api/admissionregistration/v1" v1 "k8s.io/api/core/v1" @@ -90,8 +91,10 @@ func Pods(ar admissionv1.AdmissionReview) *admissionv1.AdmissionResponse { Allowed: true, Patch: patchBytes, } - pt := admissionv1.PatchTypeJSONPatch - reviewResponse.PatchType = &pt + if len(patchBytes) > 0 { + pt := admissionv1.PatchTypeJSONPatch + reviewResponse.PatchType = &pt + } return &reviewResponse } diff --git a/pkg/webhooks/admission/queues/mutate/mutate_queue.go b/pkg/webhooks/admission/queues/mutate/mutate_queue.go index b39546b0b5..4845c41cd9 100644 --- a/pkg/webhooks/admission/queues/mutate/mutate_queue.go +++ b/pkg/webhooks/admission/queues/mutate/mutate_queue.go @@ -88,12 +88,15 @@ func Queues(ar admissionv1.AdmissionReview) *admissionv1.AdmissionResponse { } } - pt := admissionv1.PatchTypeJSONPatch - return &admissionv1.AdmissionResponse{ - Allowed: true, - Patch: patchBytes, - PatchType: &pt, + reviewResponse := admissionv1.AdmissionResponse{ + Allowed: true, + Patch: patchBytes, } + if len(patchBytes) > 0 { + pt := admissionv1.PatchTypeJSONPatch + reviewResponse.PatchType = &pt + } + return &reviewResponse } func createQueuePatch(queue *schedulingv1beta1.Queue) ([]byte, error) { From b9b61cca1ed46d94a89333402178d712372aa1e7 Mon Sep 17 00:00:00 2001 From: yongjiahe Date: Thu, 12 May 2022 15:49:47 +0800 Subject: [PATCH 02/18] feat exclude unhealthy devices Signed-off-by: yongjiahe --- pkg/scheduler/api/node_info.go | 26 ++++++++++++++++++++++++++ pkg/scheduler/api/well_known_labels.go | 3 +++ 2 files changed, 29 insertions(+) diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go index b1a55f8478..52c1be4d2c 100644 --- a/pkg/scheduler/api/node_info.go +++ b/pkg/scheduler/api/node_info.go @@ -19,6 +19,7 @@ package api import ( "fmt" "strconv" + "strings" v1 "k8s.io/api/core/v1" "k8s.io/klog" @@ -327,6 +328,10 @@ func (ni *NodeInfo) setNodeGPUInfo(node *v1.Node) { for i := 0; i < int(gpuNumber); i++ { ni.GPUDevices[i] = NewGPUDevice(i, memoryPerCard) } + unhealthyGPUs := ni.getUnhealthyGPUs(node) + for id := range unhealthyGPUs { + delete(ni.GPUDevices, id) + } } // SetNode sets kubernetes node object to nodeInfo object @@ -580,3 +585,24 @@ func (ni *NodeInfo) SubGPUResource(pod *v1.Pod) { } } } + +// getUnhealthyGPUs returns all the unhealthy GPU id. +func (ni *NodeInfo) getUnhealthyGPUs(node *v1.Node) (unhealthyGPUs map[int]bool) { + unhealthyGPUs = map[int]bool{} + devicesStr, ok := node.Annotations[UnhealthyGPUIDs] + + if !ok { + return + } + + idsStr := strings.Split(devicesStr, ",") + for _, sid := range idsStr { + id, err := strconv.Atoi(sid) + if err != nil { + klog.Warningf("Failed to parse unhealthy gpu id %s due to %v", sid, err) + } else { + unhealthyGPUs[id] = true + } + } + return +} diff --git a/pkg/scheduler/api/well_known_labels.go b/pkg/scheduler/api/well_known_labels.go index 52197f88fa..3b58820e82 100644 --- a/pkg/scheduler/api/well_known_labels.go +++ b/pkg/scheduler/api/well_known_labels.go @@ -28,6 +28,9 @@ const ( // GPUIndex is the key of gpu index GPUIndex = "volcano.sh/gpu-index" + // UnhealthyGPUIndexes list of unhealthy gpu ids + UnhealthyGPUIDs = "volcano.sh/gpu-unhealthy-ids" + // OversubscriptionNode is the key of node oversubscription OversubscriptionNode = "volcano.sh/oversubscription" // OversubscriptionCPU is the key of cpu oversubscription From cfe58da314a33eb335d38febb5941bcc697ce505 Mon Sep 17 00:00:00 2001 From: yongjiahe Date: Thu, 12 May 2022 16:03:14 +0800 Subject: [PATCH 03/18] feat log Signed-off-by: yongjiahe --- pkg/scheduler/api/node_info.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go index 52c1be4d2c..d675d2b95d 100644 --- a/pkg/scheduler/api/node_info.go +++ b/pkg/scheduler/api/node_info.go @@ -330,6 +330,7 @@ func (ni *NodeInfo) setNodeGPUInfo(node *v1.Node) { } unhealthyGPUs := ni.getUnhealthyGPUs(node) for id := range unhealthyGPUs { + klog.Info("delete unhealthy gpu id %d from GPUDevices", id) delete(ni.GPUDevices, id) } } From d7af9898b706f1faef4ad0d80ea5bff2dd38fb1e Mon Sep 17 00:00:00 2001 From: yongjiahe Date: Thu, 12 May 2022 16:52:19 +0800 Subject: [PATCH 04/18] fix comment Signed-off-by: yongjiahe --- pkg/scheduler/api/well_known_labels.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/scheduler/api/well_known_labels.go b/pkg/scheduler/api/well_known_labels.go index 3b58820e82..54bf830339 100644 --- a/pkg/scheduler/api/well_known_labels.go +++ b/pkg/scheduler/api/well_known_labels.go @@ -28,7 +28,7 @@ const ( // GPUIndex is the key of gpu index GPUIndex = "volcano.sh/gpu-index" - // UnhealthyGPUIndexes list of unhealthy gpu ids + // UnhealthyGPUIds list of unhealthy gpu ids UnhealthyGPUIDs = "volcano.sh/gpu-unhealthy-ids" // OversubscriptionNode is the key of node oversubscription From 8da360b291ecb27e43df42af1044df6f12ac388c Mon Sep 17 00:00:00 2001 From: yongjiahe Date: Fri, 13 May 2022 11:26:11 +0800 Subject: [PATCH 05/18] fix comment Signed-off-by: yongjiahe --- pkg/scheduler/api/well_known_labels.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/scheduler/api/well_known_labels.go b/pkg/scheduler/api/well_known_labels.go index 54bf830339..2e1db71c74 100644 --- a/pkg/scheduler/api/well_known_labels.go +++ b/pkg/scheduler/api/well_known_labels.go @@ -28,7 +28,7 @@ const ( // GPUIndex is the key of gpu index GPUIndex = "volcano.sh/gpu-index" - // UnhealthyGPUIds list of unhealthy gpu ids + // UnhealthyGPUIDs list of unhealthy gpu ids UnhealthyGPUIDs = "volcano.sh/gpu-unhealthy-ids" // OversubscriptionNode is the key of node oversubscription From 94b1d777f5bb7f63755f38b699b57e30e7c040bb Mon Sep 17 00:00:00 2001 From: yongjiahe Date: Fri, 13 May 2022 11:26:49 +0800 Subject: [PATCH 06/18] fix log Signed-off-by: yongjiahe --- pkg/scheduler/api/node_info.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go index d675d2b95d..16782a7c0e 100644 --- a/pkg/scheduler/api/node_info.go +++ b/pkg/scheduler/api/node_info.go @@ -330,7 +330,7 @@ func (ni *NodeInfo) setNodeGPUInfo(node *v1.Node) { } unhealthyGPUs := ni.getUnhealthyGPUs(node) for id := range unhealthyGPUs { - klog.Info("delete unhealthy gpu id %d from GPUDevices", id) + klog.V(4).Infof("delete unhealthy gpu id %d from GPUDevices", id) delete(ni.GPUDevices, id) } } From a03221a739e6d8fb66eb5c4c73f609b72aa11550 Mon Sep 17 00:00:00 2001 From: yongjiahe Date: Fri, 13 May 2022 11:48:11 +0800 Subject: [PATCH 07/18] fix unhealthy gpu data struc array Signed-off-by: yongjiahe --- pkg/scheduler/api/node_info.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go index 16782a7c0e..1ffed8719b 100644 --- a/pkg/scheduler/api/node_info.go +++ b/pkg/scheduler/api/node_info.go @@ -329,9 +329,9 @@ func (ni *NodeInfo) setNodeGPUInfo(node *v1.Node) { ni.GPUDevices[i] = NewGPUDevice(i, memoryPerCard) } unhealthyGPUs := ni.getUnhealthyGPUs(node) - for id := range unhealthyGPUs { - klog.V(4).Infof("delete unhealthy gpu id %d from GPUDevices", id) - delete(ni.GPUDevices, id) + for i := range unhealthyGPUs { + klog.V(4).Infof("delete unhealthy gpu id %d from GPUDevices", unhealthyGPUs[i]) + delete(ni.GPUDevices, unhealthyGPUs[i]) } } @@ -588,8 +588,8 @@ func (ni *NodeInfo) SubGPUResource(pod *v1.Pod) { } // getUnhealthyGPUs returns all the unhealthy GPU id. -func (ni *NodeInfo) getUnhealthyGPUs(node *v1.Node) (unhealthyGPUs map[int]bool) { - unhealthyGPUs = map[int]bool{} +func (ni *NodeInfo) getUnhealthyGPUs(node *v1.Node) (unhealthyGPUs []int) { + unhealthyGPUs = []int{} devicesStr, ok := node.Annotations[UnhealthyGPUIDs] if !ok { @@ -602,7 +602,7 @@ func (ni *NodeInfo) getUnhealthyGPUs(node *v1.Node) (unhealthyGPUs map[int]bool) if err != nil { klog.Warningf("Failed to parse unhealthy gpu id %s due to %v", sid, err) } else { - unhealthyGPUs[id] = true + unhealthyGPUs = append(unhealthyGPUs, id) } } return From d4d104ff738ae12cfd061538afe0c780e2fd1dc4 Mon Sep 17 00:00:00 2001 From: shinytang6 <1074461480@qq.com> Date: Thu, 12 May 2022 16:22:49 +0800 Subject: [PATCH 08/18] feat(deploy): add more deploy switches on helm Signed-off-by: shinytang6 <1074461480@qq.com> --- installer/README.md | 4 ++++ installer/helm/chart/volcano/templates/admission.yaml | 2 ++ installer/helm/chart/volcano/templates/controllers.yaml | 2 ++ installer/helm/chart/volcano/templates/scheduler.yaml | 4 +++- installer/helm/chart/volcano/values.yaml | 4 +++- 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/installer/README.md b/installer/README.md index 66309b8f8a..8567269b76 100644 --- a/installer/README.md +++ b/installer/README.md @@ -79,6 +79,10 @@ The following are the list configurable parameters of Volcano Chart and their de |`basic.admission_app_name`|Admission Controller App Name|`volcano-admission`| |`basic.controller_app_name`|Controller App Name|`volcano-controller`| |`basic.scheduler_app_name`|Scheduler App Name|`volcano-scheduler`| +|`custom.metrics_enable`|Whether to Enable Metrics|`false`| +|`custom.admission_enable`|Whether to Enable Admission|`true`| +|`custom.controller_enable`|Whether to Enable Controller|`true`| +|`custom.scheduler_enable`|Whether to Enable Scheduler|`true`| Specify each parameter using the `--set key=value[,key=value]` argument to `helm install`. For example, diff --git a/installer/helm/chart/volcano/templates/admission.yaml b/installer/helm/chart/volcano/templates/admission.yaml index 5700b5a5bc..50256f61d6 100644 --- a/installer/helm/chart/volcano/templates/admission.yaml +++ b/installer/helm/chart/volcano/templates/admission.yaml @@ -1,3 +1,4 @@ +{{- if .Values.custom.admission_enable }} apiVersion: v1 kind: ConfigMap metadata: @@ -153,3 +154,4 @@ spec: imagePullPolicy: IfNotPresent command: ["./gen-admission-secret.sh", "--service", "{{ .Release.Name }}-admission-service", "--namespace", "{{ .Release.Namespace }}", "--secret", "{{.Values.basic.admission_secret_name}}"] +{{- end }} \ No newline at end of file diff --git a/installer/helm/chart/volcano/templates/controllers.yaml b/installer/helm/chart/volcano/templates/controllers.yaml index 70db686ca6..5354fa06b4 100644 --- a/installer/helm/chart/volcano/templates/controllers.yaml +++ b/installer/helm/chart/volcano/templates/controllers.yaml @@ -1,3 +1,4 @@ +{{- if .Values.custom.controller_enable }} apiVersion: v1 kind: ServiceAccount metadata: @@ -105,3 +106,4 @@ spec: - -v=4 - 2>&1 imagePullPolicy: "IfNotPresent" +{{- end }} \ No newline at end of file diff --git a/installer/helm/chart/volcano/templates/scheduler.yaml b/installer/helm/chart/volcano/templates/scheduler.yaml index a382b57891..650390bff5 100644 --- a/installer/helm/chart/volcano/templates/scheduler.yaml +++ b/installer/helm/chart/volcano/templates/scheduler.yaml @@ -1,3 +1,4 @@ +{{- if .Values.custom.scheduler_enable }} apiVersion: v1 kind: ConfigMap metadata: @@ -145,4 +146,5 @@ spec: targetPort: 8080 selector: app: volcano-scheduler - type: ClusterIP \ No newline at end of file + type: ClusterIP +{{- end }} \ No newline at end of file diff --git a/installer/helm/chart/volcano/values.yaml b/installer/helm/chart/volcano/values.yaml index eeba4f01a0..dec749edf9 100644 --- a/installer/helm/chart/volcano/values.yaml +++ b/installer/helm/chart/volcano/values.yaml @@ -6,8 +6,10 @@ basic: admission_secret_name: "volcano-admission-secret" admission_config_file: "config/volcano-admission.conf" scheduler_config_file: "config/volcano-scheduler.conf" - image_pull_secret: "" admission_port: 8443 custom: metrics_enable: false + admission_enable: true + controller_enable: true + scheduler_enable: true From 8d124fedf11a5afe3e2b8753171fda342079442c Mon Sep 17 00:00:00 2001 From: wpeng102 Date: Fri, 13 May 2022 15:00:45 +0800 Subject: [PATCH 09/18] high priority task cannot preemt low priority task when queue is overused and cluster has idle resource Signed-off-by: wpeng102 --- pkg/scheduler/actions/preempt/preempt.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index ffb2e76d12..467f0cb4e6 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -110,7 +110,7 @@ func (pmpt *Action) Execute(ssn *framework.Session) { preemptor := preemptorTasks[preemptorJob.UID].Pop().(*api.TaskInfo) - if preempted, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool { + if preempted, _ := preempt(ssn, stmt, preemptor, preemptorJob, func(task *api.TaskInfo) bool { // Ignore non running task. if task.Status != api.Running { return false @@ -165,7 +165,7 @@ func (pmpt *Action) Execute(ssn *framework.Session) { preemptor := preemptorTasks[job.UID].Pop().(*api.TaskInfo) stmt := framework.NewStatement(ssn) - assigned, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool { + assigned, _ := preempt(ssn, stmt, preemptor, job, func(task *api.TaskInfo) bool { // Ignore non running task. if task.Status != api.Running { return false @@ -197,6 +197,7 @@ func preempt( ssn *framework.Session, stmt *framework.Statement, preemptor *api.TaskInfo, + job *api.JobInfo, filter func(*api.TaskInfo) bool, predicateHelper util.PredicateHelper, ) (bool, error) { @@ -209,6 +210,9 @@ func preempt( nodeScores := util.PrioritizeNodes(preemptor, predicateNodes, ssn.BatchNodeOrderFn, ssn.NodeOrderMapFn, ssn.NodeOrderReduceFn) selectedNodes := util.SortNodes(nodeScores) + + currentQueue := ssn.Queues[job.Queue] + for _, node := range selectedNodes { klog.V(3).Infof("Considering Task <%s/%s> on Node <%s>.", preemptor.Namespace, preemptor.Name, node.Name) @@ -240,7 +244,8 @@ func preempt( for !victimsQueue.Empty() { // If reclaimed enough resources, break loop to avoid Sub panic. - if preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { + // If preemptor's queue is overused, it means preemptor can not be allcated. So no ned care about the node idle resourace + if !ssn.Overused(currentQueue) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { break } preemptee := victimsQueue.Pop().(*api.TaskInfo) @@ -258,7 +263,8 @@ func preempt( klog.V(3).Infof("Preempted <%v> for Task <%s/%s> requested <%v>.", preempted, preemptor.Namespace, preemptor.Name, preemptor.InitResreq) - if preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { + // If preemptor's queue is overused, it means preemptor can not be allcated. So no ned care about the node idle resourace + if !ssn.Overused(currentQueue) && preemptor.InitResreq.LessEqual(node.FutureIdle(), api.Zero) { if err := stmt.Pipeline(preemptor, node.Name); err != nil { klog.Errorf("Failed to pipeline Task <%s/%s> on Node <%s>", preemptor.Namespace, preemptor.Name, node.Name) From 9a597e9d96dbda97ff9af1a3d757f27933b3aa9b Mon Sep 17 00:00:00 2001 From: wpeng102 Date: Mon, 30 May 2022 14:30:32 +0800 Subject: [PATCH 10/18] address comments Signed-off-by: wpeng102 --- pkg/scheduler/actions/preempt/preempt.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index 467f0cb4e6..080b6d4e8c 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -17,6 +17,8 @@ limitations under the License. package preempt import ( + "fmt" + "k8s.io/klog" "volcano.sh/volcano/pkg/scheduler/api" @@ -110,7 +112,7 @@ func (pmpt *Action) Execute(ssn *framework.Session) { preemptor := preemptorTasks[preemptorJob.UID].Pop().(*api.TaskInfo) - if preempted, _ := preempt(ssn, stmt, preemptor, preemptorJob, func(task *api.TaskInfo) bool { + if preempted, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool { // Ignore non running task. if task.Status != api.Running { return false @@ -165,7 +167,7 @@ func (pmpt *Action) Execute(ssn *framework.Session) { preemptor := preemptorTasks[job.UID].Pop().(*api.TaskInfo) stmt := framework.NewStatement(ssn) - assigned, _ := preempt(ssn, stmt, preemptor, job, func(task *api.TaskInfo) bool { + assigned, _ := preempt(ssn, stmt, preemptor, func(task *api.TaskInfo) bool { // Ignore non running task. if task.Status != api.Running { return false @@ -197,7 +199,6 @@ func preempt( ssn *framework.Session, stmt *framework.Statement, preemptor *api.TaskInfo, - job *api.JobInfo, filter func(*api.TaskInfo) bool, predicateHelper util.PredicateHelper, ) (bool, error) { @@ -211,6 +212,11 @@ func preempt( selectedNodes := util.SortNodes(nodeScores) + job, found := ssn.Jobs[preemptor.Job] + if !found { + return false, fmt.Errorf("Job %s not found in SSN", preemptor.Job) + } + currentQueue := ssn.Queues[job.Queue] for _, node := range selectedNodes { From a2ca248a0ca597cab7e19892b2a8c8b8d58f21d2 Mon Sep 17 00:00:00 2001 From: shaoqiu <516595344@qq.com> Date: Sat, 28 May 2022 18:02:30 +0800 Subject: [PATCH 11/18] avoid panic for query prometheus no data; Signed-off-by: shaoqiu <516595344@qq.com> --- pkg/scheduler/cache/cache.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index 60a660cd74..7d3131e77a 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -1225,8 +1225,16 @@ func (sc *SchedulerCache) GetMetricsData() { if len(warnings) > 0 { klog.V(3).Infof("Warning querying Prometheus: %v", warnings) } + if res == nil || res.String() == "" { + klog.V(4).Infof("Warning querying Prometheus: no data found for %s", queryStr) + continue + } + klog.V(4).Infof("Query prometheus res %s", res.String()) rowValues := strings.Split(strings.TrimSpace(res.String()), "=>") + if len(rowValues) < 2 { + continue + } value := strings.Split(strings.TrimSpace(rowValues[1]), " ") switch metric { case cpuUsageAvg: From 7d399361207162a864aae23c39e36c713267b250 Mon Sep 17 00:00:00 2001 From: shaoqiu <516595344@qq.com> Date: Sat, 28 May 2022 18:39:31 +0800 Subject: [PATCH 12/18] add some comment; Signed-off-by: shaoqiu <516595344@qq.com> --- pkg/scheduler/cache/cache.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index 7d3131e77a..49df76a8fa 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -1232,6 +1232,7 @@ func (sc *SchedulerCache) GetMetricsData() { klog.V(4).Infof("Query prometheus res %s", res.String()) rowValues := strings.Split(strings.TrimSpace(res.String()), "=>") + // ignore invalid prometheus.data to nodeUsage.map if len(rowValues) < 2 { continue } From 67bd83eb2253d329ca5df75426f561ac55a58d1a Mon Sep 17 00:00:00 2001 From: shaoqiu <516595344@qq.com> Date: Mon, 30 May 2022 11:59:51 +0800 Subject: [PATCH 13/18] modify prometheus.query.result judg; add node.resourceUsage log; Signed-off-by: shaoqiu <516595344@qq.com> --- pkg/scheduler/cache/cache.go | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index 49df76a8fa..3939852a56 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -26,6 +26,7 @@ import ( "time" "github.com/prometheus/client_golang/api" + pmodel "github.com/prometheus/common/model" prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" v1 "k8s.io/api/core/v1" @@ -1231,11 +1232,15 @@ func (sc *SchedulerCache) GetMetricsData() { } klog.V(4).Infof("Query prometheus res %s", res.String()) - rowValues := strings.Split(strings.TrimSpace(res.String()), "=>") - // ignore invalid prometheus.data to nodeUsage.map - if len(rowValues) < 2 { + switch res.Type() { + case pmodel.ValScalar, pmodel.ValMatrix: + continue + case pmodel.ValVector: // type pmodel.ValVector need only + default: continue } + firstRowValVector := strings.Split(res.String(), "\n")[0] + rowValues := strings.Split(strings.TrimSpace(firstRowValVector), "=>") value := strings.Split(strings.TrimSpace(rowValues[1]), " ") switch metric { case cpuUsageAvg: @@ -1256,10 +1261,27 @@ func (sc *SchedulerCache) GetMetricsData() { func (sc *SchedulerCache) setMetricsData(usageInfo map[string]*schedulingapi.NodeUsage) { sc.Mutex.Lock() defer sc.Mutex.Unlock() + beforeNodeinfo := make(map[string]string, len(sc.Nodes)) + for nName, nInfo := range sc.Nodes { + beforeNodeinfo[nName] = fmt.Sprintf("%+v", *nInfo.ResourceUsage) + } for k := range usageInfo { nodeInfo, ok := sc.Nodes[k] if ok { nodeInfo.ResourceUsage = usageInfo[k] } } + // record node.ResourceUsage changes + for nName, nInfo := range sc.Nodes { + newResourceUsage := fmt.Sprintf("%+v", *nInfo.ResourceUsage) + oldResourceUsage, found := beforeNodeinfo[nName] + if !found { + klog.V(4).Infof("new node: %s, ResourceUsage: %s", nName, newResourceUsage) + continue + } + if oldResourceUsage == newResourceUsage { + continue + } + klog.V(4).Infof("node: %s, ResourceUsage: %s => %s", nName, oldResourceUsage, newResourceUsage) + } } From 3392c26f61e61074b22556d2ad47979066509e85 Mon Sep 17 00:00:00 2001 From: shaoqiu <516595344@qq.com> Date: Mon, 30 May 2022 13:03:40 +0800 Subject: [PATCH 14/18] simplify logging record; Signed-off-by: shaoqiu <516595344@qq.com> --- pkg/scheduler/cache/cache.go | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index 3939852a56..c0944cd26a 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -1261,27 +1261,12 @@ func (sc *SchedulerCache) GetMetricsData() { func (sc *SchedulerCache) setMetricsData(usageInfo map[string]*schedulingapi.NodeUsage) { sc.Mutex.Lock() defer sc.Mutex.Unlock() - beforeNodeinfo := make(map[string]string, len(sc.Nodes)) - for nName, nInfo := range sc.Nodes { - beforeNodeinfo[nName] = fmt.Sprintf("%+v", *nInfo.ResourceUsage) - } + for k := range usageInfo { nodeInfo, ok := sc.Nodes[k] if ok { + klog.V(4).Infof("node: %s, ResourceUsage: %+v => %+v", k, *nodeInfo.ResourceUsage, *usageInfo[k]) nodeInfo.ResourceUsage = usageInfo[k] } } - // record node.ResourceUsage changes - for nName, nInfo := range sc.Nodes { - newResourceUsage := fmt.Sprintf("%+v", *nInfo.ResourceUsage) - oldResourceUsage, found := beforeNodeinfo[nName] - if !found { - klog.V(4).Infof("new node: %s, ResourceUsage: %s", nName, newResourceUsage) - continue - } - if oldResourceUsage == newResourceUsage { - continue - } - klog.V(4).Infof("node: %s, ResourceUsage: %s => %s", nName, oldResourceUsage, newResourceUsage) - } } From 1d2c4d78d63393a6ed911905779318fc72ad1fdd Mon Sep 17 00:00:00 2001 From: shaoqiu <516595344@qq.com> Date: Mon, 30 May 2022 16:58:45 +0800 Subject: [PATCH 15/18] lower some dataprint.log.level; Signed-off-by: shaoqiu <516595344@qq.com> --- pkg/scheduler/cache/cache.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index c0944cd26a..bc1afb00e2 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -1227,11 +1227,11 @@ func (sc *SchedulerCache) GetMetricsData() { klog.V(3).Infof("Warning querying Prometheus: %v", warnings) } if res == nil || res.String() == "" { - klog.V(4).Infof("Warning querying Prometheus: no data found for %s", queryStr) + klog.V(3).Infof("Warning querying Prometheus: no data found for %s", queryStr) continue } - klog.V(4).Infof("Query prometheus res %s", res.String()) + klog.V(3).Infof("Query prometheus res %s", res.String()) switch res.Type() { case pmodel.ValScalar, pmodel.ValMatrix: continue @@ -1265,7 +1265,7 @@ func (sc *SchedulerCache) setMetricsData(usageInfo map[string]*schedulingapi.Nod for k := range usageInfo { nodeInfo, ok := sc.Nodes[k] if ok { - klog.V(4).Infof("node: %s, ResourceUsage: %+v => %+v", k, *nodeInfo.ResourceUsage, *usageInfo[k]) + klog.V(3).Infof("node: %s, ResourceUsage: %+v => %+v", k, *nodeInfo.ResourceUsage, *usageInfo[k]) nodeInfo.ResourceUsage = usageInfo[k] } } From 312a12f48199a407f96dc6bc584db3c9519755c6 Mon Sep 17 00:00:00 2001 From: shaoqiu <516595344@qq.com> Date: Mon, 30 May 2022 17:20:42 +0800 Subject: [PATCH 16/18] simplify somecode; Signed-off-by: shaoqiu <516595344@qq.com> --- pkg/scheduler/cache/cache.go | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index bc1afb00e2..e29d61b21e 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -1227,18 +1227,14 @@ func (sc *SchedulerCache) GetMetricsData() { klog.V(3).Infof("Warning querying Prometheus: %v", warnings) } if res == nil || res.String() == "" { - klog.V(3).Infof("Warning querying Prometheus: no data found for %s", queryStr) + klog.Warning("Warning querying Prometheus: no data found for %s", queryStr) continue } - - klog.V(3).Infof("Query prometheus res %s", res.String()) - switch res.Type() { - case pmodel.ValScalar, pmodel.ValMatrix: - continue - case pmodel.ValVector: // type pmodel.ValVector need only - default: + // plugin.usage only need type pmodel.ValVector in Prometheus.rulues + if res.Type() != pmodel.ValVector { continue } + // only method res.String() can get data, dataType []pmodel.ValVector, eg: "{k1:v1, ...} => #[value] @#[timespace]\n {k2:v2, ...} => ..." firstRowValVector := strings.Split(res.String(), "\n")[0] rowValues := strings.Split(strings.TrimSpace(firstRowValVector), "=>") value := strings.Split(strings.TrimSpace(rowValues[1]), " ") From 8c6af32fbdd58901be584d25d32d125742ef1916 Mon Sep 17 00:00:00 2001 From: shaoqiu <516595344@qq.com> Date: Mon, 30 May 2022 18:50:43 +0800 Subject: [PATCH 17/18] debug; Signed-off-by: shaoqiu <516595344@qq.com> --- pkg/scheduler/cache/cache.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index e29d61b21e..289b8228ea 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -26,8 +26,8 @@ import ( "time" "github.com/prometheus/client_golang/api" - pmodel "github.com/prometheus/common/model" prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" + pmodel "github.com/prometheus/common/model" v1 "k8s.io/api/core/v1" schedulingv1 "k8s.io/api/scheduling/v1" @@ -1227,7 +1227,7 @@ func (sc *SchedulerCache) GetMetricsData() { klog.V(3).Infof("Warning querying Prometheus: %v", warnings) } if res == nil || res.String() == "" { - klog.Warning("Warning querying Prometheus: no data found for %s", queryStr) + klog.Warningf("Warning querying Prometheus: no data found for %s", queryStr) continue } // plugin.usage only need type pmodel.ValVector in Prometheus.rulues From a4f20dcc98b2183a1060f2ea7a43e1c0d9504139 Mon Sep 17 00:00:00 2001 From: shaoqiu <516595344@qq.com> Date: Mon, 30 May 2022 22:51:32 +0800 Subject: [PATCH 18/18] local vendor update; Signed-off-by: shaoqiu <516595344@qq.com> --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 03ccc2651e..54a96a69c8 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/onsi/ginkgo/v2 v2.0.0 github.com/onsi/gomega v1.17.0 github.com/prometheus/client_golang v1.12.0 + github.com/prometheus/common v0.32.1 github.com/spf13/cobra v1.2.1 github.com/spf13/pflag v1.0.5 go.uber.org/automaxprocs v1.4.0 @@ -72,7 +73,6 @@ require ( github.com/opencontainers/selinux v1.8.2 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_model v0.2.0 // indirect - github.com/prometheus/common v0.32.1 // indirect github.com/prometheus/procfs v0.7.3 // indirect go.etcd.io/etcd/api/v3 v3.5.0 // indirect go.etcd.io/etcd/client/pkg/v3 v3.5.0 // indirect