Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

swarm: Add update/rollback order #30261

Merged
merged 2 commits into from
Apr 7, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions api/swagger.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2296,6 +2296,12 @@ definitions:
description: "The fraction of tasks that may fail during an update before the failure action is invoked, specified as a floating point number between 0 and 1."
type: "number"
default: 0
Order:
description: "The order of operations when rolling out an updated task. Either the old task is shut down before the new task is started, or the new task is started before the old task is shut down."
type: "string"
enum:
- "stop-first"
- "start-first"
RollbackConfig:
description: "Specification for the rollback strategy of the service."
type: "object"
Expand All @@ -2322,6 +2328,12 @@ definitions:
description: "The fraction of tasks that may fail during a rollback before the failure action is invoked, specified as a floating point number between 0 and 1."
type: "number"
default: 0
Order:
description: "The order of operations when rolling back a task. Either the old task is shut down before the new task is started, or the new task is started before the old task is shut down."
type: "string"
enum:
- "stop-first"
- "start-first"
Networks:
description: "Array of network names or IDs to attach the service to."
type: "array"
Expand Down
10 changes: 10 additions & 0 deletions api/types/swarm/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ const (
UpdateFailureActionContinue = "continue"
// UpdateFailureActionRollback ROLLBACK
UpdateFailureActionRollback = "rollback"

// UpdateOrderStopFirst STOP_FIRST
UpdateOrderStopFirst = "stop-first"
// UpdateOrderStartFirst START_FIRST
UpdateOrderStartFirst = "start-first"
)

// UpdateConfig represents the update configuration.
Expand Down Expand Up @@ -111,4 +116,9 @@ type UpdateConfig struct {
// If the failure action is PAUSE, no more tasks will be updated until
// another update is started.
MaxFailureRatio float32

// Order indicates the order of operations when rolling out an updated
// task. Either the old task is shut down before the new task is
// started, or the new task is started before the old task is shut down.
Order string
}
10 changes: 10 additions & 0 deletions cli/command/formatter/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ UpdateConfig:
Monitoring Period: {{ .UpdateMonitor }}
{{- end }}
Max failure ratio: {{ .UpdateMaxFailureRatio }}
Update order: {{ .UpdateOrder }}
{{- end }}
{{- if .HasRollbackConfig }}
RollbackConfig:
Expand All @@ -69,6 +70,7 @@ RollbackConfig:
Monitoring Period: {{ .RollbackMonitor }}
{{- end }}
Max failure ratio: {{ .RollbackMaxFailureRatio }}
Rollback order: {{ .RollbackOrder }}
{{- end }}
ContainerSpec:
Image: {{ .ContainerImage }}
Expand Down Expand Up @@ -260,6 +262,10 @@ func (ctx *serviceInspectContext) UpdateOnFailure() string {
return ctx.Service.Spec.UpdateConfig.FailureAction
}

// UpdateOrder returns the Order value stored in the service spec's
// UpdateConfig. It dereferences UpdateConfig without a nil check, so it must
// only be called when the service has an update configuration.
func (ctx *serviceInspectContext) UpdateOrder() string {
	updateConfig := ctx.Service.Spec.UpdateConfig
	return updateConfig.Order
}

// HasUpdateMonitor reports whether the service spec's UpdateConfig has a
// Monitor duration strictly greater than zero nanoseconds.
func (ctx *serviceInspectContext) HasUpdateMonitor() bool {
	monitor := ctx.Service.Spec.UpdateConfig.Monitor
	return 0 < monitor.Nanoseconds()
}
Expand Down Expand Up @@ -304,6 +310,10 @@ func (ctx *serviceInspectContext) RollbackMaxFailureRatio() float32 {
return ctx.Service.Spec.RollbackConfig.MaxFailureRatio
}

// RollbackOrder returns the Order value stored in the service spec's
// RollbackConfig. It dereferences RollbackConfig without a nil check, so it
// must only be called when the service has a rollback configuration.
func (ctx *serviceInspectContext) RollbackOrder() string {
	rollbackConfig := ctx.Service.Spec.RollbackConfig
	return rollbackConfig.Order
}

// ContainerImage returns the Image field of the container spec in the
// service's task template.
func (ctx *serviceInspectContext) ContainerImage() string {
	containerSpec := ctx.Service.Spec.TaskTemplate.ContainerSpec
	return containerSpec.Image
}
Expand Down
8 changes: 8 additions & 0 deletions cli/command/service/opts.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ type updateOptions struct {
monitor time.Duration
onFailure string
maxFailureRatio floatValue
order string
}

func (opts updateOptions) config() *swarm.UpdateConfig {
Expand All @@ -197,6 +198,7 @@ func (opts updateOptions) config() *swarm.UpdateConfig {
Monitor: opts.monitor,
FailureAction: opts.onFailure,
MaxFailureRatio: opts.maxFailureRatio.Value(),
Order: opts.order,
}
}

Expand Down Expand Up @@ -533,6 +535,8 @@ func addServiceFlags(flags *pflag.FlagSet, opts *serviceOptions) {
flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", `Action on update failure ("pause"|"continue"|"rollback")`)
flags.Var(&opts.update.maxFailureRatio, flagUpdateMaxFailureRatio, "Failure rate to tolerate during an update")
flags.SetAnnotation(flagUpdateMaxFailureRatio, "version", []string{"1.25"})
flags.StringVar(&opts.update.order, flagUpdateOrder, "stop-first", `Update order ("start-first"|"stop-first")`)
flags.SetAnnotation(flagUpdateOrder, "version", []string{"1.29"})

flags.Uint64Var(&opts.rollback.parallelism, flagRollbackParallelism, 1, "Maximum number of tasks rolled back simultaneously (0 to roll back all at once)")
flags.SetAnnotation(flagRollbackParallelism, "version", []string{"1.28"})
Expand All @@ -544,6 +548,8 @@ func addServiceFlags(flags *pflag.FlagSet, opts *serviceOptions) {
flags.SetAnnotation(flagRollbackFailureAction, "version", []string{"1.28"})
flags.Var(&opts.rollback.maxFailureRatio, flagRollbackMaxFailureRatio, "Failure rate to tolerate during a rollback")
flags.SetAnnotation(flagRollbackMaxFailureRatio, "version", []string{"1.28"})
flags.StringVar(&opts.rollback.order, flagRollbackOrder, "stop-first", `Rollback order ("start-first"|"stop-first")`)
flags.SetAnnotation(flagRollbackOrder, "version", []string{"1.29"})

flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "vip", "Endpoint mode (vip or dnsrr)")

Expand Down Expand Up @@ -633,6 +639,7 @@ const (
flagRollbackFailureAction = "rollback-failure-action"
flagRollbackMaxFailureRatio = "rollback-max-failure-ratio"
flagRollbackMonitor = "rollback-monitor"
flagRollbackOrder = "rollback-order"
flagRollbackParallelism = "rollback-parallelism"
flagStopGracePeriod = "stop-grace-period"
flagStopSignal = "stop-signal"
Expand All @@ -641,6 +648,7 @@ const (
flagUpdateFailureAction = "update-failure-action"
flagUpdateMaxFailureRatio = "update-max-failure-ratio"
flagUpdateMonitor = "update-monitor"
flagUpdateOrder = "update-order"
flagUpdateParallelism = "update-parallelism"
flagUser = "user"
flagWorkdir = "workdir"
Expand Down
6 changes: 4 additions & 2 deletions cli/command/service/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
return err
}

if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateMonitor, flagUpdateFailureAction, flagUpdateMaxFailureRatio) {
if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateMonitor, flagUpdateFailureAction, flagUpdateMaxFailureRatio, flagUpdateOrder) {
if spec.UpdateConfig == nil {
spec.UpdateConfig = &swarm.UpdateConfig{}
}
Expand All @@ -329,9 +329,10 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
updateDuration(flagUpdateMonitor, &spec.UpdateConfig.Monitor)
updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction)
updateFloatValue(flagUpdateMaxFailureRatio, &spec.UpdateConfig.MaxFailureRatio)
updateString(flagUpdateOrder, &spec.UpdateConfig.Order)
}

if anyChanged(flags, flagRollbackParallelism, flagRollbackDelay, flagRollbackMonitor, flagRollbackFailureAction, flagRollbackMaxFailureRatio) {
if anyChanged(flags, flagRollbackParallelism, flagRollbackDelay, flagRollbackMonitor, flagRollbackFailureAction, flagRollbackMaxFailureRatio, flagRollbackOrder) {
if spec.RollbackConfig == nil {
spec.RollbackConfig = &swarm.UpdateConfig{}
}
Expand All @@ -340,6 +341,7 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
updateDuration(flagRollbackMonitor, &spec.RollbackConfig.Monitor)
updateString(flagRollbackFailureAction, &spec.RollbackConfig.FailureAction)
updateFloatValue(flagRollbackMaxFailureRatio, &spec.RollbackConfig.MaxFailureRatio)
updateString(flagRollbackOrder, &spec.RollbackConfig.Order)
}

if flags.Changed(flagEndpointMode) {
Expand Down
18 changes: 17 additions & 1 deletion daemon/cluster/convert/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,13 @@ func updateConfigFromGRPC(updateConfig *swarmapi.UpdateConfig) *types.UpdateConf
converted.FailureAction = types.UpdateFailureActionRollback
}

switch updateConfig.Order {
case swarmapi.UpdateConfig_STOP_FIRST:
converted.Order = types.UpdateOrderStopFirst
case swarmapi.UpdateConfig_START_FIRST:
converted.Order = types.UpdateOrderStartFirst
}

return converted
}

Expand All @@ -415,12 +422,21 @@ func updateConfigToGRPC(updateConfig *types.UpdateConfig) (*swarmapi.UpdateConfi
case types.UpdateFailureActionRollback:
converted.FailureAction = swarmapi.UpdateConfig_ROLLBACK
default:
return nil, fmt.Errorf("unrecongized update failure action %s", updateConfig.FailureAction)
return nil, fmt.Errorf("unrecognized update failure action %s", updateConfig.FailureAction)
}
if updateConfig.Monitor != 0 {
converted.Monitor = gogotypes.DurationProto(updateConfig.Monitor)
}

switch updateConfig.Order {
case types.UpdateOrderStopFirst, "":
converted.Order = swarmapi.UpdateConfig_STOP_FIRST
case types.UpdateOrderStartFirst:
converted.Order = swarmapi.UpdateConfig_START_FIRST
default:
return nil, fmt.Errorf("unrecognized update order %s", updateConfig.Order)
}

return converted, nil
}

Expand Down
2 changes: 2 additions & 0 deletions docs/reference/commandline/service_create.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ Options:
--rollback-max-failure-ratio float Failure rate to tolerate during a rollback
--rollback-monitor duration Duration after each task rollback to monitor for failure
(ns|us|ms|s|m|h) (default 0s)
--rollback-order string Rollback order ("start-first"|"stop-first") (default "stop-first")
--rollback-parallelism uint Maximum number of tasks rolled back simultaneously (0 to roll
back all at once) (default 1)
--secret secret Specify secrets to expose to the service
Expand All @@ -75,6 +76,7 @@ Options:
--update-failure-action string Action on update failure ("pause"|"continue"|"rollback") (default "pause")
--update-max-failure-ratio float Failure rate to tolerate during an update
--update-monitor duration Duration after each task update to monitor for failure (ns|us|ms|s|m|h)
--update-order string Update order ("start-first"|"stop-first") (default "stop-first")
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-u, --user string Username or UID (format: <name|uid>[:<group|gid>])
--with-registry-auth Send registry authentication details to swarm agents
Expand Down
2 changes: 2 additions & 0 deletions docs/reference/commandline/service_update.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ Options:
--rollback-max-failure-ratio float Failure rate to tolerate during a rollback
--rollback-monitor duration Duration after each task rollback to monitor for failure
(ns|us|ms|s|m|h) (default 0s)
--rollback-order string Rollback order ("start-first"|"stop-first") (default "stop-first")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Stating default in service update command is a little misleading. If you don't specify the flag, it keeps existing value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, however the other --update-* flags have the same problem.

      --update-delay duration              Delay between updates (ns|us|ms|s|m|h) (default 0s)
      --update-failure-action string       Action on update failure ("pause"|"continue"|"rollback") (default "pause")
      --update-monitor duration            Duration after each task update to monitor for failure (ns|us|ms|s|m|h)
      --update-parallelism uint            Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)

Let's deal with this in a separate PR. Would you mind filing an issue?

--rollback-parallelism uint Maximum number of tasks rolled back simultaneously (0 to roll
back all at once) (default 1)
--secret-add secret Add or update a secret on a service
Expand All @@ -88,6 +89,7 @@ Options:
--update-failure-action string Action on update failure ("pause"|"continue"|"rollback") (default "pause")
--update-max-failure-ratio float Failure rate to tolerate during an update
--update-monitor duration Duration after each task update to monitor for failure (ns|us|ms|s|m|h)
--update-order string Update order ("start-first"|"stop-first") (default "stop-first")
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-u, --user string Username or UID (format: <name|uid>[:<group|gid>])
--with-registry-auth Send registry authentication details to swarm agents
Expand Down
109 changes: 109 additions & 0 deletions integration-cli/docker_api_swarm_service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,115 @@ func (s *DockerSwarmSuite) TestAPISwarmServicesUpdate(c *check.C) {
map[string]int{image1: instances})
}

func (s *DockerSwarmSuite) TestAPISwarmServicesUpdateStartFirst(c *check.C) {
d := s.AddDaemon(c, true, true)

// service image at start
image1 := "busybox:latest"
// target image in update
image2 := "testhealth"

// service started from this image won't pass health check
_, _, err := d.BuildImageWithOut(image2,
`FROM busybox
HEALTHCHECK --interval=1s --timeout=1s --retries=1024\
CMD cat /status`,
true)
c.Check(err, check.IsNil)

// create service
instances := 5
parallelism := 2
rollbackParallelism := 3
id := d.CreateService(c, serviceForUpdate, setInstances(instances), setUpdateOrder(swarm.UpdateOrderStartFirst), setRollbackOrder(swarm.UpdateOrderStartFirst))

checkStartingTasks := func(expected int) []swarm.Task {
var startingTasks []swarm.Task
waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
tasks := d.GetServiceTasks(c, id)
startingTasks = nil
for _, t := range tasks {
if t.Status.State == swarm.TaskStateStarting {
startingTasks = append(startingTasks, t)
}
}
return startingTasks, nil
}, checker.HasLen, expected)

return startingTasks
}

makeTasksHealthy := func(tasks []swarm.Task) {
for _, t := range tasks {
containerID := t.Status.ContainerStatus.ContainerID
d.Cmd("exec", containerID, "touch", "/status")
}
}

// wait for tasks ready
waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
map[string]int{image1: instances})

// issue service update
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is admittedly quite nit picky, but I personally don't find these inline comments provide much value. Many of them just echo the function name ("make it healthy", "wait for tasks ready", "update service"). These can lead to confusion when the code is modified and the comments no longer reflect reality.

I subscribe to the school of thought that inline comments should generally be avoided. They can be used sparingly when there is a very unexpected line that needs explanation. Instead of inline comments, code should be structured to reveal its intent. Comments on functions and structs, on the other hand, are great, because they can be used to generate documentation, and they apply to a well-defined block of code (the function body or struct definition).

Applying this rule here, some of these comments could be deleted (as the function names are already explicit enough), others (like "wait for tasks ready") could be replaced by extracting a new function, which would also remove duplication:

func waitForTasksReady(c *check.C, tasks map[string]int) {
   ...
}

Are there benefits to this style of commenting that I'm missing?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mostly agree. Note that the comments were copied from TestServiceHealthStart and TestAPISwarmServicesUpdate rather than being original in this new test.

service := d.GetService(c, id)
d.UpdateService(c, service, setImage(image2))

// first batch

// The old tasks should be running, and the new ones should be starting.
startingTasks := checkStartingTasks(parallelism)

waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
map[string]int{image1: instances})

// make it healthy
makeTasksHealthy(startingTasks)

waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
map[string]int{image1: instances - parallelism, image2: parallelism})

// 2nd batch

// The old tasks should be running, and the new ones should be starting.
startingTasks = checkStartingTasks(parallelism)

waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
map[string]int{image1: instances - parallelism, image2: parallelism})

// make it healthy
makeTasksHealthy(startingTasks)

waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
map[string]int{image1: instances - 2*parallelism, image2: 2 * parallelism})

// 3rd batch

// The old tasks should be running, and the new ones should be starting.
startingTasks = checkStartingTasks(1)

waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
map[string]int{image1: instances - 2*parallelism, image2: 2 * parallelism})

// make it healthy
makeTasksHealthy(startingTasks)

waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
map[string]int{image2: instances})

// Roll back to the previous version. This uses the CLI because
// rollback is a client-side operation.
out, err := d.Cmd("service", "update", "--rollback", id)
c.Assert(err, checker.IsNil, check.Commentf(out))

// first batch
waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
map[string]int{image2: instances - rollbackParallelism, image1: rollbackParallelism})

// 2nd batch
waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
map[string]int{image1: instances})
}

func (s *DockerSwarmSuite) TestAPISwarmServicesFailedUpdate(c *check.C) {
const nodeCount = 3
var daemons [nodeCount]*daemon.Swarm
Expand Down
18 changes: 18 additions & 0 deletions integration-cli/docker_api_swarm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,24 @@ func setInstances(replicas int) daemon.ServiceConstructor {
}
}

// setUpdateOrder returns a ServiceConstructor that sets the Order field of
// the service's UpdateConfig to order, allocating an empty UpdateConfig first
// when the spec does not have one yet.
func setUpdateOrder(order string) daemon.ServiceConstructor {
	return func(s *swarm.Service) {
		cfg := s.Spec.UpdateConfig
		if cfg == nil {
			cfg = &swarm.UpdateConfig{}
			s.Spec.UpdateConfig = cfg
		}
		cfg.Order = order
	}
}

// setRollbackOrder returns a ServiceConstructor that sets the Order field of
// the service's RollbackConfig to order, allocating an empty UpdateConfig for
// it first when the spec does not have one yet.
func setRollbackOrder(order string) daemon.ServiceConstructor {
	return func(s *swarm.Service) {
		cfg := s.Spec.RollbackConfig
		if cfg == nil {
			cfg = &swarm.UpdateConfig{}
			s.Spec.RollbackConfig = cfg
		}
		cfg.Order = order
	}
}

func setImage(image string) daemon.ServiceConstructor {
return func(s *swarm.Service) {
s.Spec.TaskTemplate.ContainerSpec.Image = image
Expand Down