Skip to content

Commit

Permalink
Work in progress - force rescheduling of failed allocs
Browse files Browse the repository at this point in the history
  • Loading branch information
preetapan committed May 8, 2018
1 parent ac2f070 commit 242cc19
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 9 deletions.
8 changes: 7 additions & 1 deletion command/agent/job_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,14 @@ func (s *HTTPServer) jobForceEvaluate(resp http.ResponseWriter, req *http.Reques
if req.Method != "PUT" && req.Method != "POST" {
return nil, CodedError(405, ErrInvalidMethod)
}

evalOptions := structs.EvalOptions{}
if _, ok := req.URL.Query()["force"]; ok {
evalOptions.ForceReschedule = true
}
args := structs.JobEvaluateRequest{
JobID: jobName,
JobID: jobName,
EvalOptions: evalOptions,
}
s.parseWriteRequest(req, &args.WriteRequest)

Expand Down
39 changes: 34 additions & 5 deletions nomad/job_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ var (
RTarget: ">= 0.6.1",
Operand: structs.ConstraintVersion,
}

// allowForceRescheduleTransition is the transition that allows failed
// allocations to be force rescheduled. We create a one off
// variable to avoid creating a new object for every request.
// The same pointer is handed out for every matching allocation, so it
// must be treated as read-only.
allowForceRescheduleTransition = &structs.DesiredTransition{
ForceReschedule: helper.BoolToPtr(true),
}
)

// Job endpoint is used for job interactions
Expand Down Expand Up @@ -538,6 +545,27 @@ func (j *Job) Evaluate(args *structs.JobEvaluateRequest, reply *structs.JobRegis
return fmt.Errorf("can't evaluate parameterized job")
}

// forceRescheduleAllocs maps allocation ID to the desired transition that
// marks it for forced rescheduling. It stays empty unless the caller
// explicitly asked for a forced reschedule.
forceRescheduleAllocs := make(map[string]*structs.DesiredTransition)
if args.EvalOptions.ForceReschedule {
	// Find any failed allocs that could be force rescheduled
	allocs, err := snap.AllocsByJob(ws, args.RequestNamespace(), args.JobID, false)
	if err != nil {
		return err
	}

	for _, alloc := range allocs {
		taskGroup := job.LookupTaskGroup(alloc.TaskGroup)
		// Forcing rescheduling is only allowed if the task group has
		// rescheduling enabled, so skip allocs whose group is unknown or
		// whose policy is disabled. Enabled() is safe to call on a nil
		// policy.
		if taskGroup == nil || !taskGroup.ReschedulePolicy.Enabled() {
			continue
		}

		// Only failed allocs that have not already been replaced qualify.
		if alloc.NextAllocation == "" && alloc.ClientStatus == structs.AllocClientStatusFailed {
			forceRescheduleAllocs[alloc.ID] = allowForceRescheduleTransition
		}
	}
}

// Create a new evaluation
eval := &structs.Evaluation{
ID: uuid.Generate(),
Expand All @@ -549,13 +577,14 @@ func (j *Job) Evaluate(args *structs.JobEvaluateRequest, reply *structs.JobRegis
JobModifyIndex: job.ModifyIndex,
Status: structs.EvalStatusPending,
}
update := &structs.EvalUpdateRequest{
Evals: []*structs.Evaluation{eval},
WriteRequest: structs.WriteRequest{Region: args.Region},

// Create an AllocUpdateDesiredTransitionRequest with the eval and any force-rescheduled allocs
updateTransitionReq := &structs.AllocUpdateDesiredTransitionRequest{
Allocs: forceRescheduleAllocs,
Evals: []*structs.Evaluation{eval},
}
_, evalIndex, err := j.srv.raftApply(structs.AllocUpdateDesiredTransitionRequestType, updateTransitionReq)

// Commit this evaluation via Raft
_, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update)
if err != nil {
j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err)
return err
Expand Down
33 changes: 30 additions & 3 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -465,10 +465,15 @@ type JobDeregisterOptions struct {

// JobEvaluateRequest is used when we just need to re-evaluate a target job
type JobEvaluateRequest struct {
JobID string
JobID string
EvalOptions EvalOptions
WriteRequest
}

// EvalOptions carries optional settings for a job evaluation request.
type EvalOptions struct {
// ForceReschedule, when true, asks the Evaluate endpoint to mark failed
// allocations of the job for forced rescheduling.
ForceReschedule bool
}

// JobSpecificRequest is used when we just need to specify a target job
type JobSpecificRequest struct {
JobID string
Expand Down Expand Up @@ -2988,13 +2993,17 @@ func (r *ReschedulePolicy) Copy() *ReschedulePolicy {
return nrp
}

// Enabled reports whether the policy allows rescheduling. A nil policy is
// disabled; otherwise the policy is enabled when it permits at least one
// attempt or is marked unlimited.
func (r *ReschedulePolicy) Enabled() bool {
	if r == nil {
		return false
	}
	return r.Unlimited || r.Attempts > 0
}

// Validate uses different criteria to validate the reschedule policy
// Delay must be a minimum of 5 seconds
// Delay Ceiling is ignored if Delay Function is "constant"
// Number of possible attempts is validated, given the interval, delay and delay function
func (r *ReschedulePolicy) Validate() error {
enabled := r != nil && (r.Attempts > 0 || r.Unlimited)
if !enabled {
if !r.Enabled() {
return nil
}
var mErr multierror.Error
Expand Down Expand Up @@ -5608,6 +5617,11 @@ type DesiredTransition struct {
// automatically eligible. An example is an allocation that is part of a
// deployment.
Reschedule *bool

// ForceReschedule is used to indicate that this allocation must be rescheduled.
// This field is only used when operators want to force a placement even if
// a failed allocation is not eligible to be rescheduled
ForceReschedule *bool
}

// Merge merges the two desired transitions, preferring the values from the
Expand All @@ -5620,6 +5634,10 @@ func (d *DesiredTransition) Merge(o *DesiredTransition) {
if o.Reschedule != nil {
d.Reschedule = o.Reschedule
}

if o.ForceReschedule != nil {
d.ForceReschedule = o.ForceReschedule
}
}

// ShouldMigrate returns whether the transition object dictates a migration.
Expand All @@ -5633,6 +5651,15 @@ func (d *DesiredTransition) ShouldReschedule() bool {
return d.Reschedule != nil && *d.Reschedule
}

// ShouldForceReschedule reports whether this transition requests a forced
// reschedule. A nil transition or an unset ForceReschedule flag yields false.
func (d *DesiredTransition) ShouldForceReschedule() bool {
	return d != nil && d.ForceReschedule != nil && *d.ForceReschedule
}

const (
AllocDesiredStatusRun = "run" // Allocation should run
AllocDesiredStatusStop = "stop" // Allocation should stop
Expand Down
5 changes: 5 additions & 0 deletions scheduler/reconcile_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,11 @@ func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID stri
return
}

// Check if the allocation has been marked to be force rescheduled
if alloc.DesiredTransition.ShouldForceReschedule() {
rescheduleNow = true
}

// Reschedule if the eval ID matches the alloc's followup evalID or if its close to its reschedule time
rescheduleTime, eligible := alloc.NextRescheduleTime()
if eligible && (alloc.FollowupEvalID == evalID || rescheduleTime.Sub(now) <= rescheduleWindowSize) {
Expand Down

0 comments on commit 242cc19

Please sign in to comment.