From af6d75a3eecdc567efb4034405a2677ed5d44f8b Mon Sep 17 00:00:00 2001
From: Andrew Ross <andrross@amazon.com>
Date: Fri, 14 Jun 2024 15:06:30 -0700
Subject: [PATCH] Fix
 AwarenessAttributeDecommissionIT.testConcurrentDecommissionAction

The problem is that this test would decommission one of six nodes. The
tear down logic of the test would attempt to assert on the health of the
cluster by randomly selecting a node and requesting the cluster health.
If this random check happened to select the node that was
decommissioned, then the test would fail. The fix is to recommission
the node at the end of the test.

Also, the "recommission node and assert cluster health" logic was
duplicated in multiple tests, so it is now factored out into the
deleteDecommissionStateAndWaitForStableCluster helper method.

Resolves #14290
Resolves #12197

Signed-off-by: Andrew Ross <andrross@amazon.com>
---
 .../AwarenessAttributeDecommissionIT.java     | 74 +++++++------------
 1 file changed, 25 insertions(+), 49 deletions(-)

diff --git a/server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java b/server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java
index b33d57ed43189..beed6e6846b46 100644
--- a/server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java
+++ b/server/src/internalClusterTest/java/org/opensearch/cluster/coordination/AwarenessAttributeDecommissionIT.java
@@ -539,18 +539,7 @@ private void assertNodesRemovedAfterZoneDecommission(boolean originalClusterMana
             assertEquals(originalClusterManager, currentClusterManager);
         }
 
-        // Will wait for all events to complete
-        client(activeNode).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
-
-        // Recommissioning the zone back to gracefully succeed the test once above tests succeeds
-        DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(currentClusterManager).execute(
-            DeleteDecommissionStateAction.INSTANCE,
-            new DeleteDecommissionStateRequest()
-        ).get();
-        assertTrue(deleteDecommissionStateResponse.isAcknowledged());
-
-        // will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster
-        ensureStableCluster(15, TimeValue.timeValueMinutes(2));
+        deleteDecommissionStateAndWaitForStableCluster(currentClusterManager, 15);
     }
 
     public void testDecommissionFailedWhenDifferentAttributeAlreadyDecommissioned() throws Exception {
@@ -617,18 +606,7 @@ public void testDecommissionFailedWhenDifferentAttributeAlreadyDecommissioned()
             )
         );
 
-        // Will wait for all events to complete
-        client(node_in_c).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
-
-        // Recommissioning the zone back to gracefully succeed the test once above tests succeeds
-        DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(node_in_c).execute(
-            DeleteDecommissionStateAction.INSTANCE,
-            new DeleteDecommissionStateRequest()
-        ).get();
-        assertTrue(deleteDecommissionStateResponse.isAcknowledged());
-
-        // will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster
-        ensureStableCluster(6, TimeValue.timeValueMinutes(2));
+        deleteDecommissionStateAndWaitForStableCluster(node_in_c, 6);
     }
 
     public void testDecommissionStatusUpdatePublishedToAllNodes() throws ExecutionException, InterruptedException {
@@ -748,20 +726,7 @@ public void testDecommissionStatusUpdatePublishedToAllNodes() throws ExecutionEx
         );
         logger.info("--> Verified the decommissioned node has in_progress state.");
 
-        // Will wait for all events to complete
-        client(activeNode).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
-        logger.info("--> Got LANGUID event");
-        // Recommissioning the zone back to gracefully succeed the test once above tests succeeds
-        DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(activeNode).execute(
-            DeleteDecommissionStateAction.INSTANCE,
-            new DeleteDecommissionStateRequest()
-        ).get();
-        assertTrue(deleteDecommissionStateResponse.isAcknowledged());
-        logger.info("--> Deleting decommission done.");
-
-        // will wait for cluster to stabilise with a timeout of 2 min (findPeerInterval for decommissioned nodes)
-        // as by then all nodes should have joined the cluster
-        ensureStableCluster(6, TimeValue.timeValueSeconds(121));
+        deleteDecommissionStateAndWaitForStableCluster(activeNode, 6);
     }
 
     public void testDecommissionFailedWhenAttributeNotWeighedAway() throws Exception {
@@ -983,15 +948,7 @@ public void testDecommissionAcknowledgedIfWeightsNotSetForNonRoutingNode() throw
         assertEquals(clusterState.nodes().getDataNodes().size(), 3);
         assertEquals(clusterState.nodes().getClusterManagerNodes().size(), 2);
 
-        // Recommissioning the zone back to gracefully succeed the test once above tests succeeds
-        DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(dataNodes.get(0)).execute(
-            DeleteDecommissionStateAction.INSTANCE,
-            new DeleteDecommissionStateRequest()
-        ).get();
-        assertTrue(deleteDecommissionStateResponse.isAcknowledged());
-
-        // will wait for cluster to stabilise with a timeout of 2 min as by then all nodes should have joined the cluster
-        ensureStableCluster(6, TimeValue.timeValueMinutes(2));
+        deleteDecommissionStateAndWaitForStableCluster(dataNodes.get(0), 6);
     }
 
     public void testConcurrentDecommissionAction() throws Exception {
@@ -1019,7 +976,7 @@ public void testConcurrentDecommissionAction() throws Exception {
                 .build()
         );
         logger.info("--> start 3 data nodes on zones 'a' & 'b' & 'c'");
-        internalCluster().startNodes(
+        final String bZoneDataNode = internalCluster().startNodes(
             Settings.builder()
                 .put(commonSettings)
                 .put("node.attr.zone", "a")
@@ -1035,7 +992,7 @@ public void testConcurrentDecommissionAction() throws Exception {
                 .put("node.attr.zone", "c")
                 .put(onlyRole(commonSettings, DiscoveryNodeRole.DATA_ROLE))
                 .build()
-        );
+        ).get(1);
 
         ensureStableCluster(6);
         ClusterHealthResponse health = client().admin()
@@ -1100,6 +1057,25 @@ public void testConcurrentDecommissionAction() throws Exception {
         assertEquals(concurrentRuns, numRequestAcknowledged.get() + numRequestUnAcknowledged.get() + numRequestFailed.get());
         assertEquals(concurrentRuns - 1, numRequestFailed.get());
         assertEquals(1, numRequestAcknowledged.get() + numRequestUnAcknowledged.get());
+
+        deleteDecommissionStateAndWaitForStableCluster(bZoneDataNode, 6);
+    }
+
+    private void deleteDecommissionStateAndWaitForStableCluster(String activeNodeName, int expectedClusterSize) throws ExecutionException,
+        InterruptedException {
+        client(activeNodeName).admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).get();
+
+        // Recommission by deleting the decommission state, so the cluster is whole again when the test tears down
+        DeleteDecommissionStateResponse deleteDecommissionStateResponse = client(activeNodeName).execute(
+            DeleteDecommissionStateAction.INSTANCE,
+            new DeleteDecommissionStateRequest()
+        ).get();
+        assertTrue(deleteDecommissionStateResponse.isAcknowledged());
+        logger.info("--> Deleting decommission done.");
+
+        // Wait up to 121 seconds, i.e. just over 2 min (findPeerInterval for decommissioned nodes),
+        // for the cluster to stabilise, as by then all nodes should have joined the cluster
+        ensureStableCluster(expectedClusterSize, TimeValue.timeValueSeconds(121));
+    }
 
     private static class WaitForFailedDecommissionState implements ClusterStateObserver.Listener {