Skip to content

Commit

Permalink
Jobspec checklist
Browse files Browse the repository at this point in the history
  • Loading branch information
DerekStrickland committed Mar 3, 2022
1 parent ef4f8b7 commit 20e2a22
Show file tree
Hide file tree
Showing 8 changed files with 174 additions and 0 deletions.
1 change: 1 addition & 0 deletions api/tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,7 @@ type TaskGroup struct {
Services []*Service `hcl:"service,block"`
ShutdownDelay *time.Duration `mapstructure:"shutdown_delay" hcl:"shutdown_delay,optional"`
StopAfterClientDisconnect *time.Duration `mapstructure:"stop_after_client_disconnect" hcl:"stop_after_client_disconnect,optional"`
MaxClientDisconnect *time.Duration `mapstructure:"max_client_disconnect" hcl:"max_client_disconnect,optional"`
Scaling *ScalingPolicy `hcl:"scaling,block"`
Consul *Consul `hcl:"consul,block"`
}
Expand Down
4 changes: 4 additions & 0 deletions command/agent/job_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,10 @@ func ApiTgToStructsTG(job *structs.Job, taskGroup *api.TaskGroup, tg *structs.Ta
tg.StopAfterClientDisconnect = taskGroup.StopAfterClientDisconnect
}

if taskGroup.MaxClientDisconnect != nil {
tg.MaxClientDisconnect = taskGroup.MaxClientDisconnect
}

if taskGroup.ReschedulePolicy != nil {
tg.ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: *taskGroup.ReschedulePolicy.Attempts,
Expand Down
2 changes: 2 additions & 0 deletions command/agent/job_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2445,6 +2445,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) {
},
},
},
MaxClientDisconnect: helper.TimeToPtr(30 * time.Second),
Tasks: []*api.Task{
{
Name: "task1",
Expand Down Expand Up @@ -2840,6 +2841,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) {
},
},
},
MaxClientDisconnect: helper.TimeToPtr(30 * time.Second),
Tasks: []*structs.Task{
{
Name: "task1",
Expand Down
14 changes: 14 additions & 0 deletions nomad/structs/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,20 @@ func (tg *TaskGroup) Diff(other *TaskGroup, contextual bool) (*TaskGroupDiff, er
}
}

// MaxClientDisconnect diff
if oldPrimitiveFlat != nil && newPrimitiveFlat != nil {
if tg.MaxClientDisconnect == nil {
oldPrimitiveFlat["MaxClientDisconnect"] = ""
} else {
oldPrimitiveFlat["MaxClientDisconnect"] = fmt.Sprintf("%d", *tg.MaxClientDisconnect)
}
if other.MaxClientDisconnect == nil {
newPrimitiveFlat["MaxClientDisconnect"] = ""
} else {
newPrimitiveFlat["MaxClientDisconnect"] = fmt.Sprintf("%d", *other.MaxClientDisconnect)
}
}

// Diff the primitive fields.
diff.Fields = fieldDiffs(oldPrimitiveFlat, newPrimitiveFlat, false)

Expand Down
69 changes: 69 additions & 0 deletions nomad/structs/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3899,6 +3899,75 @@ func TestTaskGroupDiff(t *testing.T) {
},
},
},
{
TestCase: "MaxClientDisconnect added",
Old: &TaskGroup{
Name: "foo",
MaxClientDisconnect: nil,
},
New: &TaskGroup{
Name: "foo",
MaxClientDisconnect: helper.TimeToPtr(20 * time.Second),
},
Expected: &TaskGroupDiff{
Type: DiffTypeEdited,
Name: "foo",
Fields: []*FieldDiff{
{
Type: DiffTypeAdded,
Name: "MaxClientDisconnect",
Old: "",
New: "20000000000",
},
},
},
},
{
TestCase: "MaxClientDisconnect updated",
Old: &TaskGroup{
Name: "foo",
MaxClientDisconnect: helper.TimeToPtr(10 * time.Second),
},
New: &TaskGroup{
Name: "foo",
MaxClientDisconnect: helper.TimeToPtr(20 * time.Second),
},
Expected: &TaskGroupDiff{
Type: DiffTypeEdited,
Name: "foo",
Fields: []*FieldDiff{
{
Type: DiffTypeEdited,
Name: "MaxClientDisconnect",
Old: "10000000000",
New: "20000000000",
},
},
},
},
{
TestCase: "MaxClientDisconnect deleted",
Old: &TaskGroup{
Name: "foo",
MaxClientDisconnect: helper.TimeToPtr(10 * time.Second),
},
New: &TaskGroup{
Name: "foo",
MaxClientDisconnect: nil,
},
Expected: &TaskGroupDiff{
Type: DiffTypeEdited,
Name: "foo",
Fields: []*FieldDiff{
{
Type: DiffTypeDeleted,
Name: "MaxClientDisconnect",
Old: "10000000000",
New: "",
},
},
},
},
}

for i, c := range cases {
Expand Down
8 changes: 8 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -4359,6 +4359,10 @@ func (j *Job) Validate() error {
}
}

if tg.MaxClientDisconnect != nil && *tg.MaxClientDisconnect < 0 {
mErr.Errors = append(mErr.Errors, errors.New("max_client_disconnect must be a positive value"))
}

if j.Type == "system" && tg.Count > 1 {
mErr.Errors = append(mErr.Errors,
fmt.Errorf("Job task group %s has count %d. Count cannot exceed 1 with system scheduler",
Expand Down Expand Up @@ -6145,6 +6149,10 @@ func (tg *TaskGroup) Copy() *TaskGroup {
ntg.StopAfterClientDisconnect = tg.StopAfterClientDisconnect
}

if tg.MaxClientDisconnect != nil {
ntg.MaxClientDisconnect = tg.MaxClientDisconnect
}

return ntg
}

Expand Down
58 changes: 58 additions & 0 deletions nomad/structs/structs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5036,6 +5036,47 @@ func TestAllocation_WaitClientStop(t *testing.T) {
}
}

// TestAllocation_DisconnectTimeout exercises Allocation.DisconnectTimeout for
// a task group with and without max_client_disconnect configured. It asserts
// that the returned time equals the supplied time when the setting is nil, and
// that it is offset from the supplied time by exactly the configured duration
// otherwise.
func TestAllocation_DisconnectTimeout(t *testing.T) {
	scenarios := []struct {
		desc          string
		maxDisconnect *time.Duration
	}{
		{desc: "no max_client_disconnect"},
		{desc: "has max_client_disconnect", maxDisconnect: helper.TimeToPtr(30 * time.Second)},
	}

	for _, sc := range scenarios {
		t.Run(sc.desc, func(t *testing.T) {
			// Build an allocation that points at the first task group of a
			// fresh test job, with the scenario's setting applied to it.
			job := testJob()
			job.TaskGroups[0].MaxClientDisconnect = sc.maxDisconnect
			alloc := &Allocation{
				Job:       job,
				TaskGroup: job.TaskGroups[0].Name,
			}

			start := time.Now()
			timeout := alloc.DisconnectTimeout(start)

			if sc.maxDisconnect != nil {
				elapsed := timeout.Sub(start)
				require.Equal(t, *sc.maxDisconnect, elapsed, "expected durations to be equal")
				return
			}

			require.Equal(t, start, timeout, "expected to be now")
		})
	}
}

func TestAllocation_Canonicalize_Old(t *testing.T) {
alloc := MockAlloc()
alloc.AllocatedResources = nil
Expand Down Expand Up @@ -5207,6 +5248,23 @@ func TestJobConfig_Validate_StopAferClientDisconnect(t *testing.T) {
require.NoError(t, err)
}

// TestJobConfig_Validate_MaxClientDisconnect verifies that Job.Validate
// reports an error for a negative max_client_disconnect on a task group and
// returns no error once the value is replaced with a positive duration.
func TestJobConfig_Validate_MaxClientDisconnect(t *testing.T) {
	job := testJob()

	// A negative duration must be rejected with the expected message.
	negative := -1 * time.Minute
	job.TaskGroups[0].MaxClientDisconnect = &negative

	validationErr := job.Validate()
	require.Error(t, validationErr)
	require.Contains(t, validationErr.Error(), "max_client_disconnect must be a positive value")

	// Swapping in a positive duration must make the same job validate cleanly.
	positive := 1 * time.Minute
	job.TaskGroups[0].MaxClientDisconnect = &positive

	validationErr = job.Validate()
	require.NoError(t, validationErr)
}

func TestParameterizedJobConfig_Canonicalize(t *testing.T) {
d := &ParameterizedJobConfig{}
d.Canonicalize()
Expand Down
18 changes: 18 additions & 0 deletions website/content/docs/job-specification/group.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,24 @@ job "docs" {
specified, the Nomad client will stop them after this duration. The
Nomad client process must be running for this to occur.

- `max_client_disconnect` `(string: "")` - Specifies a duration
during which a Nomad client that reconnects after failing its
[`heartbeat_grace`] window with the servers will attempt to reconnect
allocations based on this task group. By default, allocations running
on a client that fails to heartbeat to a server within the
[`heartbeat_grace`] window will be marked "lost" and Nomad will schedule
replacement allocations. However, these allocations will continue to run
on the disconnected client. The allocations, which may still be healthy,
will be stopped once the client reconnects. An operator may instead desire
that these allocations reconnect without a restart once the client reconnects.
When specified, the Nomad server will mark these allocations as "unknown".
Replacement allocations will be scheduled according to their reschedule policy
until the disconnected client reconnects. Once the disconnected client reconnects,
Nomad will compare the "unknown" allocations with their replacements and keep
the one with the better node score. If the duration expires before the
client reconnects, the allocations will be marked "lost" and Nomad will
schedule replacement allocations if necessary.

- `task` <code>([Task][]: &lt;required&gt;)</code> - Specifies one or more tasks to run
within this group. This can be specified multiple times, to add a task as part
of the group.
Expand Down

0 comments on commit 20e2a22

Please sign in to comment.