Skip to content

Commit

Permalink
scheduler: add reservation level event
Browse files Browse the repository at this point in the history
Signed-off-by: 佑祎 <zzw261520@alibaba-inc.com>
  • Loading branch information
zwzhang0107 committed May 24, 2024
1 parent cd0ebb6 commit 6582887
Show file tree
Hide file tree
Showing 3 changed files with 221 additions and 12 deletions.
128 changes: 117 additions & 11 deletions pkg/scheduler/frameworkext/eventhandlers/reservation_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ package eventhandlers

import (
"context"
"fmt"
"regexp"
"strconv"
"strings"
"time"

corev1 "k8s.io/api/core/v1"
Expand All @@ -31,6 +35,7 @@ import (
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/profile"

"github.com/koordinator-sh/koordinator/apis/extension"
schedulingv1alpha1 "github.com/koordinator-sh/koordinator/apis/scheduling/v1alpha1"
koordclientset "github.com/koordinator-sh/koordinator/pkg/client/clientset/versioned"
koordinatorinformers "github.com/koordinator-sh/koordinator/pkg/client/informers/externalversions"
Expand Down Expand Up @@ -74,24 +79,125 @@ func MakeReservationErrorHandler(
}
}

if !reservationutil.IsReservePod(pod) {
if _, reserveAffExist := pod.Annotations[extension.AnnotationReservationAffinity]; reserveAffExist {
// for pod specified reservation affinity, export new event on reservation level
reservationLevelMsg, hasReservation := generatePodEventOnReservationLevel(schedulingErr.Error())
klog.V(7).Infof("origin scheduling error info: %s. hasReservation %v. reservation msg: %s",
schedulingErr.Error(), hasReservation, reservationLevelMsg)
if hasReservation {
msg := truncateMessage(reservationLevelMsg)
// use reason=FailedScheduling-Reservation to avoid event being auto-merged
fwk.EventRecorder().Eventf(pod, nil, corev1.EventTypeWarning, "FailedScheduling-Reservation", "Scheduling", msg)
}
return false
}
} else if reservationutil.IsReservePod(pod) {
// for reservation CR, which is treated as pod internal
reservationErrorFn(ctx, fwk, podInfo, status, nominatingInfo, start)

rName := reservationutil.GetReservationNameFromReservePod(pod)
r, err := reservationLister.Get(rName)
if err != nil {
return true
}

reservationErrorFn(ctx, fwk, podInfo, status, nominatingInfo, start)
msg := truncateMessage(schedulingErr.Error())
fwk.EventRecorder().Eventf(r, nil, corev1.EventTypeWarning, "FailedScheduling", "Scheduling", msg)

rName := reservationutil.GetReservationNameFromReservePod(pod)
r, err := reservationLister.Get(rName)
if err != nil {
updateReservationStatus(koordClientSet, reservationLister, rName, schedulingErr)
return true
}
// not reservation CR, not pod with reservation affinity
return false
}
}

msg := truncateMessage(schedulingErr.Error())
fwk.EventRecorder().Eventf(r, nil, corev1.EventTypeWarning, "FailedScheduling", "Scheduling", msg)

updateReservationStatus(koordClientSet, reservationLister, rName, schedulingErr)
return true
// Patterns used by generatePodEventOnReservationLevel. They are compiled once at
// package initialization rather than on every scheduling failure.
var (
	// reservationFitErrPrefixRe matches the leading "0/N nodes are available: " of a fit error.
	reservationFitErrPrefixRe = regexp.MustCompile(`^0/[0-9]+ nodes are available: `)

	// reservationDetailSepRe matches the separators between failure items: any comma,
	// or a period that ends a sentence (followed by whitespace or the end of the
	// message). A period inside a token, e.g. the resource name "nvidia.com/gpu",
	// is deliberately NOT a separator.
	reservationDetailSepRe = regexp.MustCompile(`,|\.(?:\s|$)`)

	// reservationTotalRe matches the summary item "8 Reservation(s) matched owner total".
	reservationTotalRe = regexp.MustCompile(`^([0-9]+) Reservation\(s\) matched owner total$`)

	// reservationDetailRe matches any other reservation-level item, e.g.
	// "2 Reservation(s) Insufficient cpu".
	reservationDetailRe = regexp.MustCompile(`^([0-9]+) Reservation\(s\) .*$`)

	// reservationNodeAffinityRe matches the node-level affinity items that are also
	// reported on reservation level. Group 1 is the node count, group 2 the reason.
	// The "(missing required label)" variant is listed before its shorter prefix so
	// that the longer reason wins.
	reservationNodeAffinityRe = regexp.MustCompile(`^([0-9]+) node\(s\) (` + strings.Join([]string{
		`didn't match pod topology spread constraints \(missing required label\)`,
		`didn't match pod topology spread constraints`,
		`didn't satisfy existing pods anti-affinity rules`,
		`didn't match pod affinity rules`,
		`didn't match pod anti-affinity rules`,
	}, "|") + `)`)
)

// generatePodEventOnReservationLevel rewrites a node-level scheduling error message
// into a reservation-level one.
//
// input:
// "0/1 nodes are available: 3 Reservation(s) didn't match affinity rules, 1 Reservation(s) is unschedulable, 1 Reservation(s) is unavailable,
// 2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory, 1 Insufficient cpu, 1 Insufficient memory.
// 8 Reservation(s) matched owner total, Gang "default/demo-job-podgroup" gets rejected due to pod is unschedulable."
// output:
// "0/8 reservations are available: 3 Reservation(s) didn't match affinity rules, 1 Reservation(s) is unschedulable, 1 Reservation(s) is unavailable,
// 2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory."
//
// Items are kept in their original order. "N Reservation(s) ..." items are copied
// verbatim; the node-level affinity items matched by reservationNodeAffinityRe are
// rewritten as "at least N <reason> Reservation(s)"; everything else is dropped.
//
// The boolean result reports whether the message carried an
// "N Reservation(s) matched owner total" item. When it is false (including when the
// message does not start with the "0/N nodes are available: " prefix or the total
// cannot be parsed) the returned message is empty.
func generatePodEventOnReservationLevel(errorMsg string) (string, bool) {
	trimmed := strings.TrimSpace(errorMsg)

	prefix := reservationFitErrPrefixRe.FindString(trimmed)
	if prefix == "" {
		// not a fit error, e.g. a gang rejection that merely quotes one
		return "", false
	}

	// "3 Reservation(s) ..., 1 Reservation(s) ... . 8 Reservation(s) matched owner total."
	items := reservationDetailSepRe.Split(trimmed[len(prefix):], -1)

	total := int64(-1)
	details := make([]string, 0, len(items))

	for _, item := range items {
		item = strings.TrimSpace(item)
		if item == "" {
			continue
		}

		if m := reservationTotalRe.FindStringSubmatch(item); m != nil {
			// total item "8 Reservation(s) matched owner total"
			n, err := strconv.ParseInt(m[1], 10, 64)
			if err != nil {
				return "", false
			}
			total = n
			continue
		}

		if reservationDetailRe.MatchString(item) {
			// reservation-level detail, keep as is
			details = append(details, item)
			continue
		}

		// Node-level affinity errors also prevent reservations on those nodes from
		// being used, so surface them as a lower bound on reservation level.
		if m := reservationNodeAffinityRe.FindStringSubmatch(item); m != nil {
			details = append(details, "at least "+m[1]+" "+m[2]+" Reservation(s)")
		}
	}

	if total < 0 {
		// no reservation matched the owner; there is nothing to report on reservation level
		return "", false
	}

	return fmt.Sprintf("0/%d reservations are available: %s.", total, strings.Join(details, ", ")), true
}

func makeReservationErrorFunc(sched frameworkext.Scheduler, reservationLister schedulingv1alpha1lister.ReservationLister) scheduler.FailureHandlerFn {
Expand Down
103 changes: 103 additions & 0 deletions pkg/scheduler/frameworkext/eventhandlers/reservation_handler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1310,3 +1310,106 @@ func assertEqualReservationCondition(t *testing.T, expect, got *schedulingv1alph
assert.Equal(t, e.Reason, condition.Reason, msg)
}
}

// Test_generatePodEventOnReservationLevel feeds node-level scheduling error messages
// to generatePodEventOnReservationLevel and checks both the reported "has reservation"
// flag and, when the flag is set, the rewritten reservation-level message.
func Test_generatePodEventOnReservationLevel(t *testing.T) {
	type testCase struct {
		caseName string
		schedErr string
		expected string
		expectOK bool
	}

	cases := []testCase{
		{
			caseName: "simple reservation errors",
			schedErr: "0/3 nodes are available: 1 Reservation(s) Insufficient cpu. 1 Reservation(s) matched owner total.",
			expected: "0/1 reservations are available: 1 Reservation(s) Insufficient cpu.",
			expectOK: true,
		},
		{
			caseName: "extract reservation errors ",
			schedErr: "0/1 nodes are available: 3 Reservation(s) didn't match affinity rules, 1 Reservation(s) is unschedulable, " +
				"1 Reservation(s) is unavailable, 2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory, " +
				"1 Insufficient cpu, 1 Insufficient memory. 8 Reservation(s) matched owner total, " +
				"Gang \"default/demo-job-podgroup\" gets rejected due to pod is unschedulable.",
			expected: "0/8 reservations are available: 3 Reservation(s) didn't match affinity rules, " +
				"1 Reservation(s) is unschedulable, 1 Reservation(s) is unavailable, " +
				"2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory.",
			expectOK: true,
		},
		{
			caseName: "pod topology spread constraints missing required label errors",
			schedErr: "0/5 nodes are available: 3 node(s) didn't match pod topology spread constraints (missing required label)," +
				"1 Insufficient cpu, 1 Insufficient memory, 2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory. " +
				"8 Reservation(s) matched owner total.",
			expected: "0/8 reservations are available: at least 3 didn't match pod topology spread constraints (missing required label) Reservation(s), " +
				"2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory.",
			expectOK: true,
		},
		{
			caseName: "pod topology spread constraints errors",
			schedErr: "0/5 nodes are available: 3 node(s) didn't match pod topology spread constraints," +
				"1 Insufficient cpu, 1 Insufficient memory, 2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory. " +
				"8 Reservation(s) matched owner total.",
			expected: "0/8 reservations are available: at least 3 didn't match pod topology spread constraints Reservation(s), " +
				"2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory.",
			expectOK: true,
		},
		{
			caseName: "satisfy existing pods anti-affinity rules, errors",
			schedErr: "0/5 nodes are available: 3 node(s) didn't satisfy existing pods anti-affinity rules," +
				"1 Insufficient cpu, 1 Insufficient memory, 2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory. " +
				"8 Reservation(s) matched owner total.",
			expected: "0/8 reservations are available: at least 3 didn't satisfy existing pods anti-affinity rules Reservation(s), " +
				"2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory.",
			expectOK: true,
		},
		{
			caseName: "match pod affinity rules errors",
			schedErr: "0/5 nodes are available: 3 node(s) didn't match pod affinity rules," +
				"1 Insufficient cpu, 1 Insufficient memory, 2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory. " +
				"8 Reservation(s) matched owner total.",
			expected: "0/8 reservations are available: at least 3 didn't match pod affinity rules Reservation(s), " +
				"2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory.",
			expectOK: true,
		},
		{
			caseName: "match pod anti-affinity rules errors",
			schedErr: "0/5 nodes are available: 3 node(s) didn't match pod anti-affinity rules," +
				"1 Insufficient cpu, 1 Insufficient memory, 2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory. " +
				"8 Reservation(s) matched owner total.",
			expected: "0/8 reservations are available: at least 3 didn't match pod anti-affinity rules Reservation(s), " +
				"2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory.",
			expectOK: true,
		},
		{
			caseName: "mix affinity errors of 'match pod topology spread constraints' and 'match pod affinity rules'",
			schedErr: "0/5 nodes are available: 3 node(s) didn't match pod topology spread constraints, " +
				"1 node(s) didn't match pod affinity rules, " +
				"1 Insufficient memory, 2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory. " +
				"8 Reservation(s) matched owner total.",
			expected: "0/8 reservations are available: at least 3 didn't match pod topology spread constraints Reservation(s), " +
				"at least 1 didn't match pod affinity rules Reservation(s), " +
				"2 Reservation(s) Insufficient cpu, 1 Reservation(s) Insufficient memory.",
			expectOK: true,
		},
		{
			// a gang rejection that only quotes a fit error is not a fit error itself
			caseName: "only gang errors",
			schedErr: "Gang \"default/demo-job-podgroup\" gets rejected due to member Pod \"demo-job-kfqfs\" is" +
				"unschedulable with reason \"0/3 nodes are available: 3 Insufficient cpu.\"",
			expectOK: false,
		},
		{
			// no "Reservation(s) matched owner total" item, so nothing to report
			caseName: "only node errors",
			schedErr: `0/5 nodes are available: 3 Insufficient cpu, 2 Insufficient memory.`,
			expectOK: false,
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.caseName, func(t *testing.T) {
			msg, ok := generatePodEventOnReservationLevel(tc.schedErr)
			assert.Equal(t, tc.expectOK, ok)
			if !ok {
				// the message is only meaningful when a reservation total was found
				return
			}
			assert.Equalf(t, tc.expected, msg, "generatePodEventOnReservationLevel(%v)", tc.schedErr)
		})
	}
}
2 changes: 1 addition & 1 deletion pkg/scheduler/plugins/reservation/plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1884,7 +1884,7 @@ func TestPostFilter(t *testing.T) {
want1: framework.NewStatus(framework.Unschedulable, "4 Reservation(s) is unschedulable", "4 Reservation(s) matched owner total"),
},
{
name: "show reservation owner matched, unschedulable and affinity unmatched",
name: "show reservation matched owner, unschedulable and affinity unmatched",
args: args{
hasStateData: true,
nodeReservationDiagnosis: map[string]nodeDiagnosisState{
Expand Down

0 comments on commit 6582887

Please sign in to comment.