From a2260b1b1a3b26a2bd32c1d745a4f848acafa400 Mon Sep 17 00:00:00 2001 From: Chris Thielen Date: Mon, 24 Feb 2020 10:05:00 -0800 Subject: [PATCH] fix(health): When waiting for down instances, consider 'Starting' to be down. (#3455) If an app is stuck in Starting (due to a bug, etc), and an operator tries to fix this by deploying a new version, spinnaker would disable the cluster, but then time out waiting for the instances to go Down because it didn't consider Starting as a down state Co-authored-by: Adam Jordens --- .../clouddriver/utils/HealthHelper.groovy | 2 +- .../WaitForClusterDisableTaskSpec.groovy | 47 ++++++++++--------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/orca-clouddriver/src/main/groovy/com/netflix/spinnaker/orca/clouddriver/utils/HealthHelper.groovy b/orca-clouddriver/src/main/groovy/com/netflix/spinnaker/orca/clouddriver/utils/HealthHelper.groovy index 9b0f170b81..37837672bb 100644 --- a/orca-clouddriver/src/main/groovy/com/netflix/spinnaker/orca/clouddriver/utils/HealthHelper.groovy +++ b/orca-clouddriver/src/main/groovy/com/netflix/spinnaker/orca/clouddriver/utils/HealthHelper.groovy @@ -96,7 +96,7 @@ class HealthHelper { } // no health indicators is indicative of being down - boolean someAreDown = !healths || healths.any { it.state == 'Down' || it.state == 'OutOfService' } + boolean someAreDown = !healths || healths.any { it.state == 'Down' || it.state == 'OutOfService' || it.state == 'Starting' } boolean noneAreUp = !healths.any { it.state == 'Up' } return someAreDown && noneAreUp diff --git a/orca-clouddriver/src/test/groovy/com/netflix/spinnaker/orca/clouddriver/tasks/cluster/WaitForClusterDisableTaskSpec.groovy b/orca-clouddriver/src/test/groovy/com/netflix/spinnaker/orca/clouddriver/tasks/cluster/WaitForClusterDisableTaskSpec.groovy index fb6add2b85..c57d96cf6d 100644 --- a/orca-clouddriver/src/test/groovy/com/netflix/spinnaker/orca/clouddriver/tasks/cluster/WaitForClusterDisableTaskSpec.groovy +++ b/orca-clouddriver/src/test/groovy/com/netflix/spinnaker/orca/clouddriver/tasks/cluster/WaitForClusterDisableTaskSpec.groovy @@ -75,35 +75,36 @@ class WaitForClusterDisableTaskSpec extends Specification { result.getStatus() == status where: - dsgregion | minWaitTime | oldSGDisabled | desired | desiredPct | interestingHealthProviderNames | extraHealths | platformHealthState || status - "other" | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // exercises if (!remainingDeployServerGroups)... - "other" | 90 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || RUNNING // keeps running if duration < minWaitTime + dsgregion | minWaitTime | oldSGDisabled | desired | desiredPct | interestingHealthProviderNames | extraHealths | platformHealthState || status + "other" | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // exercises if (!remainingDeployServerGroups)... + "other" | 90 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || RUNNING // keeps running if duration < minWaitTime // tests for isDisabled==true - region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED - region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // wait for instances down even if cluster is disabled - region | 0 | true | 3 | 100 | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // also wait for instances down with a desiredPct - region | 0 | true | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED - region | 0 | true | 3 | null | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done - region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // also done if we provide it and are down... - region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...but not if that extra health is up + region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED + region | 0 | true | 3 | null | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // wait for instances down even if cluster is disabled + region | 0 | true | 3 | 100 | ['platformHealthType'] | [] | 'NotUnknown' || RUNNING // also wait for instances down with a desiredPct + region | 0 | true | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED + region | 0 | true | 3 | null | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done + region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // also done if we provide it and are down... + region | 0 | true | 3 | null | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...but not if that extra health is up // tests for isDisabled==false, no desiredPct - region | 0 | false | 3 | null | [] | [] | 'Unknown' || SUCCEEDED // no health providers to check so short-circuits early - region | 0 | false | 3 | null | null | [] | 'Unknown' || RUNNING // exercises null vs empty behavior of interestingHealthProviderNames - region | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // considered complete because only considers the platform health - region | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Up' || SUCCEEDED // considered complete because only considers the platform health, despite platform health being Up - region | 0 | false | 3 | null | ['strangeType'] | [] | 'Unknown' || RUNNING // can't complete if we need to monitor an unknown health provider... - region | 0 | false | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || RUNNING // ...regardless of down status + region | 0 | false | 3 | null | [] | [] | 'Unknown' || SUCCEEDED // no health providers to check so short-circuits early + region | 0 | false | 3 | null | null | [] | 'Unknown' || RUNNING // exercises null vs empty behavior of interestingHealthProviderNames + region | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // considered complete because only considers the platform health + region | 0 | false | 3 | null | ['platformHealthType'] | [] | 'Up' || SUCCEEDED // considered complete because only considers the platform health, despite platform health being Up + region | 0 | false | 3 | null | ['strangeType'] | [] | 'Unknown' || RUNNING // can't complete if we need to monitor an unknown health provider... + region | 0 | false | 3 | null | ['strangeType'] | health('strange', 'Down') | 'Unknown' || RUNNING // ...regardless of down status // tests for waitForRequiredInstancesDownTask.hasSucceeded - region | 0 | false | 3 | 100 | null | [] | 'Unknown' || SUCCEEDED // no other health providers than platform, and it looks down - region | 0 | false | 3 | 100 | null | [] | 'NotUnknown' || RUNNING // no other health providers than platform, and it looks NOT down - region | 0 | false | 4 | 100 | ['platformHealthType'] | [] | 'Unknown' || RUNNING // can't reach count(someAreDownAndNoneAreUp) >= targetDesiredSize - region | 0 | false | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // all look down, and we want at least 2 down so we're done - region | 0 | false | 3 | 100 | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done - region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // ...unless we have data for that health provider - region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...unless we have data for that health provider + region | 0 | false | 3 | 100 | null | [] | 'Unknown' || SUCCEEDED // no other health providers than platform, and it looks down + region | 0 | false | 3 | 100 | null | [] | 'NotUnknown' || RUNNING // no other health providers than platform, and it looks NOT down + region | 0 | false | 4 | 100 | ['platformHealthType'] | [] | 'Unknown' || RUNNING // can't reach count(someAreDownAndNoneAreUp) >= targetDesiredSize + region | 0 | false | 4 | 50 | ['platformHealthType'] | [] | 'Unknown' || SUCCEEDED // all look down, and we want at least 2 down so we're done + region | 0 | false | 3 | 100 | ['strangeType'] | [] | 'Unknown' || SUCCEEDED // intersection of interesting and provided healths is empty, so we're done + region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Down') | 'Unknown' || SUCCEEDED // ...unless we have data for that health provider + region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Up') | 'Unknown' || RUNNING // ...unless we have data for that health provider + region | 0 | false | 3 | 100 | ['strangeType'] | health('strange', 'Starting') | 'Unknown' || SUCCEEDED // Starting is considered down oldServerGroup = "v167" newServerGroup = "v168"