feat(clouddriver): Favor target capacity when current desired == 0 (#…

…2746) This handles a situation where we appear to be getting an occasional stale min/max/desired when an asg id is re-used across deployments. ie. app-v000 was deleted and re-created Also reaching out to AWS for clarification as to whether this is expected or not.
spinnaker · Mar 13, 2019 · 7821834 · 7821834
1 parent e05047c
commit 7821834
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 5 deletions.
diff --git a/...roovy/com/netflix/spinnaker/orca/clouddriver/tasks/instance/WaitForUpInstancesTask.groovy b/...roovy/com/netflix/spinnaker/orca/clouddriver/tasks/instance/WaitForUpInstancesTask.groovy
@@ -20,8 +20,11 @@ import com.netflix.spinnaker.orca.clouddriver.utils.HealthHelper
 import com.netflix.spinnaker.orca.clouddriver.utils.HealthHelper.HealthCountSnapshot
 import com.netflix.spinnaker.orca.pipeline.model.Stage
 import groovy.util.logging.Slf4j
+import org.slf4j.MDC
 import org.springframework.stereotype.Component
 
+import java.util.concurrent.TimeUnit
+
 @Component
 @Slf4j
 class WaitForUpInstancesTask extends AbstractWaitingForInstancesTask {
@@ -182,6 +185,20 @@ class WaitForUpInstancesTask extends AbstractWaitingForInstancesTask {
 
     def cloudProvider = stage.context.cloudProvider
 
+    Optional<String> taskStartTime = Optional.ofNullable(MDC.get("taskStartTime"));
+    if (taskStartTime.isPresent()) {
+      if (System.currentTimeMillis() - TimeUnit.MINUTES.toMillis(10) > Long.valueOf(taskStartTime.get())) {
+        // expectation is reconciliation has happened within 10 minutes and that the
+        // current server group capacity should be preferred
+        log.error(
+            "Short circuiting initial target capacity determination after 10 minutes (serverGroup: {}, executionId: {})",
+            "${cloudProvider}:${serverGroup.region}:${serverGroup.name}",
+            stage.execution.id
+        )
+        return serverGroupCapacity
+      }
+    }
+
     def initialTargetCapacity = getInitialTargetCapacity(stage, serverGroup)
     if (!initialTargetCapacity) {
       log.debug(
@@ -192,7 +209,8 @@ class WaitForUpInstancesTask extends AbstractWaitingForInstancesTask {
       return serverGroupCapacity
     }
 
-    if (serverGroup.capacity.max == 0 && initialTargetCapacity.max != 0) {
+    if ((serverGroup.capacity.max == 0 && initialTargetCapacity.max != 0) ||
+        (serverGroup.capacity.desired == 0 && initialTargetCapacity.desired > 0)) {
       log.info(
           "Overriding server group capacity (serverGroup: {}, initialTargetCapacity: {}, executionId: {})",
           "${cloudProvider}:${serverGroup.region}:${serverGroup.name}",

diff --git a/...y/com/netflix/spinnaker/orca/clouddriver/tasks/instance/WaitForUpInstancesTaskSpec.groovy b/...y/com/netflix/spinnaker/orca/clouddriver/tasks/instance/WaitForUpInstancesTaskSpec.groovy
@@ -21,12 +21,15 @@ import com.netflix.spinnaker.orca.clouddriver.OortService
 import com.netflix.spinnaker.orca.jackson.OrcaObjectMapper
 import com.netflix.spinnaker.orca.pipeline.model.Execution
 import com.netflix.spinnaker.orca.pipeline.model.Stage
+import org.slf4j.MDC
 import retrofit.client.Response
 import retrofit.mime.TypedString
 import spock.lang.Specification
 import spock.lang.Subject
 import spock.lang.Unroll
 
+import java.util.concurrent.TimeUnit
+
 import static com.netflix.spinnaker.orca.test.model.ExecutionBuilder.stage
 
 class WaitForUpInstancesTaskSpec extends Specification {
@@ -40,6 +43,10 @@ class WaitForUpInstancesTaskSpec extends Specification {
 
   def mapper = OrcaObjectMapper.newInstance()
 
+  void cleanup() {
+    MDC.clear()
+  }
+
   void "should check cluster to get server groups"() {
     given:
     def pipeline = Execution.newPipeline("orca")
@@ -504,19 +511,28 @@ class WaitForUpInstancesTaskSpec extends Specification {
 
     def serverGroup = [name: "app-v001", region: "us-west-2", capacity: serverGroupCapacity]
 
+    and:
+    MDC.put("taskStartTime", taskStartTime.toString())
+
     expect:
     WaitForUpInstancesTask.getServerGroupCapacity(stage, serverGroup) == expectedServerGroupCapacity
 
     where:
-    katoTasks                                                                          | serverGroupCapacity          || expectedServerGroupCapacity
-    null                                                                               | [min: 0, max: 0, desired: 0] || [min: 0, max: 0, desired: 0]
-    [[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | [min: 0, max: 0, desired: 0] || [min: 0, max: 1, desired: 1]   // should take initial capacity b/c max = 0
-    [[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | [min: 0, max: 2, desired: 2] || [min: 0, max: 2, desired: 2]   // should take current capacity b/c max > 0
+    katoTasks                                                                          | taskStartTime | serverGroupCapacity            || expectedServerGroupCapacity
+    null                                                                               | startTime(0)  | [min: 0, max: 0, desired: 0]   || [min: 0, max: 0, desired: 0]
+    [[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | startTime(9)  | [min: 0, max: 0, desired: 0]   || [min: 0, max: 1, desired: 1]   // should take initial capacity b/c max = 0
+    [[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | startTime(9)  | [min: 0, max: 400, desired: 0] || [min: 0, max: 1, desired: 1]   // should take initial capacity b/c desired = 0
+    [[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | startTime(9)  | [min: 0, max: 2, desired: 2]   || [min: 0, max: 2, desired: 2]   // should take current capacity b/c max > 0
+    [[resultObjects: [[deployments: [deployment("app-v001", "us-west-2", 0, 1, 1)]]]]] | startTime(11) | [min: 0, max: 0, desired: 0]   || [min: 0, max: 0, desired: 0]   // should take current capacity b/c timeout
   }
 
   static Map deployment(String serverGroupName, String location, int min, int max, int desired) {
     return [
         serverGroupName: serverGroupName, location: location, capacity: [min: min, max: max, desired: desired]
     ]
   }
+
+  static Long startTime(int minutesOld) {
+    return System.currentTimeMillis() - TimeUnit.MINUTES.toMillis(minutesOld)
+  }
 }