DRA: fix scheduler/resource claim controller race
There was a race caused by having to update the claim finalizer and status in two
different operations:
- The resource claim controller removes the allocation, but does not yet
  get to remove the finalizer.
- The scheduler prepares an allocation without adding the finalizer,
  because it is already there.
- The controller removes the finalizer.
- The scheduler adds the allocation.

This is an invalid state. Automated checking found this during the execution of
the "with translated parameters on single node.*supports sharing a claim
sequentially" E2E test, but only when the test was run stand-alone. When run in
parallel (as in CI), the bad outcome of the race did not occur.

The fix is to check that the finalizer is still set when adding the
allocation. The apiserver doesn't check that, because it doesn't know which
finalizer goes with the allocation result. It could check for "some finalizer",
but that is not guaranteed to be correct (it could be some unrelated one).

Checking the finalizer can only be done with a JSON patch. Despite the
complications, having the ability to add multiple pods concurrently to
ReservedFor seems worth it: it avoids expensive rescheduling or a local retry
loop.
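
To make this concrete, here is roughly what the generated JSON patch looks
like when a second pod reserves a claim that is already allocated and still
carries the driver's finalizer (the UIDs and the finalizer name are made-up
placeholders):

[
  { "op": "test", "path": "/metadata/uid", "value": "f0e1d2c3-0000-1111-2222-333344445555" },
  { "op": "test", "path": "/metadata/finalizers", "value": ["example.com/dra-protection"] },
  { "op": "add", "path": "/status/reservedFor/-", "value": {"resource": "pods", "name": "my-pod", "uid": "a1b2c3d4-6666-7777-8888-999900001111"} }
]

If any "test" op fails, for example because the controller removed the
finalizer in the meantime, the apiserver rejects the whole patch, so the
scheduler cannot re-add an allocation to a claim that is being deallocated.
When the scheduler itself allocated the claim, the patch additionally appends
"add" ops for /status/driverName and /status/allocation.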

The resource claim controller doesn't need this; it can do a normal update,
which implicitly checks the ResourceVersion.
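
For contrast, a minimal sketch of the controller-side write, assuming the same
typed clientset as in the diff below (the helper name and the exact fields it
clears are hypothetical, not the actual controller code):

import (
	"context"

	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// deallocate is a hypothetical sketch: a plain UpdateStatus sends the claim's
// ResourceVersion along with the object, so the apiserver rejects the write
// with a Conflict if anyone (for example the scheduler) modified the claim
// between the controller's read and this write.
func deallocate(ctx context.Context, clientset kubernetes.Interface, claim *resourcev1alpha2.ResourceClaim) error {
	claim = claim.DeepCopy()
	claim.Status.Allocation = nil
	claim.Status.DriverName = ""
	_, err := clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
	if apierrors.IsConflict(err) {
		// Lost a race: re-read the claim and reconcile again instead of retrying blindly.
	}
	return err
}
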
pohly committed Jun 27, 2024
1 parent 29defc1 commit ecbafb8
Showing 1 changed file with 43 additions and 9 deletions.
@@ -23,6 +23,7 @@ import (
 	"fmt"
 	"slices"
 	"sort"
+	"strings"
 	"sync"
 
 	"github.com/google/go-cmp/cmp"
@@ -1620,7 +1621,8 @@ func (pl *dynamicResources) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
 func (pl *dynamicResources) bindClaim(ctx context.Context, state *stateData, index int, pod *v1.Pod, nodeName string) (patchedClaim *resourcev1alpha2.ResourceClaim, finalErr error) {
 	logger := klog.FromContext(ctx)
 	claim := state.claims[index]
-	allocationPatch := ""
+	driverName := ""
+	allocationObject := ""
 
 	allocation := state.informationsForClaim[index].allocation
 	logger.V(5).Info("preparing claim status patch", "claim", klog.KObj(state.claims[index]), "allocation", klog.Format(allocation))
@@ -1631,7 +1633,8 @@ func (pl *dynamicResources) bindClaim(ctx context.Context, state *stateData, index int, pod *v1.Pod, nodeName string) (patchedClaim *resourcev1alpha2.ResourceClaim, finalErr error) {
 		if err != nil {
 			return nil, fmt.Errorf("marshaling AllocationResult failed: %v", err)
 		}
-		allocationPatch = fmt.Sprintf(`"driverName": %q, "allocation": %s, `, state.informationsForClaim[index].allocationDriverName, string(buffer))
+		driverName = state.informationsForClaim[index].allocationDriverName
+		allocationObject = string(buffer)
 
 		// The finalizer needs to be added in a normal update. Using a simple update is fine
 		// because we don't expect concurrent modifications while the claim is not allocated
@@ -1659,20 +1662,51 @@ func (pl *dynamicResources) bindClaim(ctx context.Context, state *stateData, index int, pod *v1.Pod, nodeName string) (patchedClaim *resourcev1alpha2.ResourceClaim, finalErr error) {
 	// Note that this also works when the allocation result gets added twice because
 	// two pods both started using a shared claim: the first pod to get here adds the
 	// allocation result. The second pod then only adds itself to reservedFor.
-	patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": {%s "reservedFor": [ {"resource": "pods", "name": %q, "uid": %q} ] }}`,
-		claim.UID,
-		allocationPatch,
-		pod.Name,
-		pod.UID,
-	)
+	//
+	// Because we allow concurrent updates, we don't check ResourceVersion
+	// in a precondition. What we *do* need to check is that the finalizer
+	// is still present. This implies using a JSON patch instead of a
+	// simpler strategic-merge-patch.
+	parts := make([]string, 0, 7)
+	parts = append(parts,
+		fmt.Sprintf(`{ "op": "test", "path": "/metadata/uid", "value": %q }`, claim.UID),
+	)
+	// Strictly speaking, we only need to check for existence of our finalizer.
+	// JSON patch does not support that, so we check for the stronger "array is equal".
+	if len(claim.Finalizers) > 0 {
+		parts = append(parts,
+			fmt.Sprintf(`{ "op": "test", "path": "/metadata/finalizers", "value": %q }`, claim.Finalizers),
+		)
+	}
+	// JSON patch can only append to a non-empty array. An empty reservedFor gets
+	// omitted and even if it didn't, it would be null and not an empty array.
+	// Therefore we have to test and add if it's currently empty.
+	reservedForEntry := fmt.Sprintf(`{"resource": "pods", "name": %q, "uid": %q}`, pod.Name, pod.UID)
+	if len(claim.Status.ReservedFor) == 0 {
+		parts = append(parts,
+			fmt.Sprintf(`{ "op": "test", "path": "/status/reservedFor", "value": null }`),
+			fmt.Sprintf(`{ "op": "add", "path": "/status/reservedFor", "value": [ %s ] }`, reservedForEntry),
+		)
+	} else {
+		parts = append(parts,
+			fmt.Sprintf(`{ "op": "add", "path": "/status/reservedFor/-", "value": %s }`, reservedForEntry),
+		)
+	}
+	if allocationObject != "" {
+		parts = append(parts,
+			fmt.Sprintf(`{ "op": "add", "path": "/status/driverName", "value": %q }`, driverName),
+			fmt.Sprintf(`{ "op": "add", "path": "/status/allocation", "value": %s }`, allocationObject),
+		)
+	}
+	patch := "[\n" + strings.Join(parts, ",\n") + "\n]"
 	if loggerV := logger.V(6); loggerV.Enabled() {
 		logger.V(5).Info("reserve", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.KObj(claim), "patch", patch)
 	} else {
 		logger.V(5).Info("reserve", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.KObj(claim))
 	}
-	claim, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}, "status")
+	claim, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{}, "status")
 	logger.V(5).Info("reserved", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.Format(claim), "err", err)
-	if allocationPatch != "" {
+	if allocationObject != "" {
 		// The scheduler was handling allocation. Now that has
 		// completed, either successfully or with a failure.
 		if err == nil {