fix cluster not able to spin up issue when disk usage exceeds threshold #15258
Merged: dblock merged 9 commits into opensearch-project:main from zane-neo:fix-cluster-unable-spin-up on Oct 16, 2024.
Commits (9):
- f38dba8: fix cluster not able to spin up issue when disk usage exceeds threshold (zane-neo)
- 5dbdab9: Add comment to changes (zane-neo)
- 4fcd310: Add UT to ensure the keepAliveThread starts before node starts (zane-neo)
- 2ad0126: remove unused imports (zane-neo)
- acebaf3: Fix forbidden API calls check failed issue (zane-neo)
- c29a02b: format code (zane-neo)
- 2b3cd47: format code (zane-neo)
- af9d70d: change setInstance method to static (zane-neo)
- 7f24452: Add countdownlatch in test to coordinate the thread to avoid concuree… (zane-neo)
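The commit messages above describe the heart of the change: start the keepAliveThread before node.start(), so that a failure during startup no longer takes the JVM down with it (commit 4fcd310 adds a unit test asserting exactly this ordering). A minimal sketch of the idea, assuming a Bootstrap-style startup flow; the class, interface, and field names below are illustrative stand-ins, not the actual OpenSearch code:

```java
import java.util.concurrent.CountDownLatch;

// Illustrative sketch only; NodeLike stands in for the real Node class.
public class BootstrapSketch {

    interface NodeLike {
        void start() throws Exception;
    }

    private final CountDownLatch keepAliveLatch = new CountDownLatch(1);
    private final Thread keepAliveThread = new Thread(() -> {
        try {
            // Block until shutdown is requested; as a non-daemon thread,
            // this alone is enough to keep the JVM alive.
            keepAliveLatch.await();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }, "opensearch[keepAlive]");

    void start(NodeLike node) throws Exception {
        keepAliveThread.setDaemon(false);
        // Start the keep-alive thread *before* node.start(): if startup fails
        // (for example because disk usage is above the flood-stage watermark),
        // the JVM stays up so the failure can be diagnosed and the cluster
        // settings corrected, instead of the process exiting immediately.
        keepAliveThread.start();

        node.start(); // may throw; previously a failure here ended the JVM
    }

    void stop() {
        // Releasing the latch lets the keep-alive thread finish and the JVM exit.
        keepAliveLatch.countDown();
    }
}
```

Whether keeping the JVM alive after a failed node.start() is desirable at all is the main point of contention in the review thread below.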
Conversations
Naively speaking, if node.start() doesn't succeed, then there is nothing to keep alive, and the JVM shutting down seems like the right thing to do. It seems like this change could lead to a partially-started or not-at-all-started node running indefinitely, because the keepAliveThread will just keep it alive in some sort of zombie state after node.start() failed. What am I missing?
Naively responding, I've no objection to the JVM shutting down, but having spent way too many hours trying to figure out why the JVM shut down in various situations, I'd be really happy to have at least one thread faithfully logging whatever issues caused it to shut down, before shutting itself down.
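One way to address that, without changing the keep-alive behavior at all, would be to catch the startup failure, log it, and only then let it propagate so the JVM still exits with the cause on record. A hedged sketch of that idea; the stand-in types and logger setup are assumptions, not the actual bootstrap code:

```java
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

// Illustrative only; NodeLike stands in for the real Node class.
class LoggedStartup {

    interface NodeLike {
        void start() throws Exception;
    }

    private static final Logger logger = LogManager.getLogger(LoggedStartup.class);

    static void startOrDie(NodeLike node) throws Exception {
        try {
            node.start();
        } catch (Exception e) {
            // Record the reason before the JVM goes down, so the shutdown
            // is never silent.
            logger.error("node failed to start; shutting down", e);
            throw e;
        }
    }
}
```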
Not every node.start() failure means the JVM should quit. In the case from the corresponding issue, if we're able to keep the JVM running, the user can fix the problem simply by changing the cluster settings. So the fix is to give the user the ability to interact with the running (even partially-started) cluster. For the cases where node.start() fails and the cluster is not available at all, this fix has no real impact, because in the end the user needs to check the error/fatal logs to find the root cause. This change doesn't block users in any way, but without it, users in the first case are blocked. So overall I think this is a positive enhancement for users.
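For the disk-usage case specifically, the settings change a user could make against the still-running (even partially-started) node would look roughly like this, using the `_cluster/settings` API. The host, port, absence of authentication, and the chosen watermark value are assumptions for illustration:

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// Illustrative: raise the flood-stage disk watermark on a local node so
// that allocation is no longer blocked by disk usage.
public class RaiseWatermark {
    public static void main(String[] args) throws Exception {
        String body = "{ \"transient\": {"
            + " \"cluster.routing.allocation.disk.watermark.flood_stage\": \"99%\" } }";

        HttpRequest request = HttpRequest.newBuilder()
            .uri(URI.create("http://localhost:9200/_cluster/settings"))
            .header("Content-Type", "application/json")
            .PUT(HttpRequest.BodyPublishers.ofString(body))
            .build();

        HttpResponse<String> response = HttpClient.newHttpClient()
            .send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode() + " " + response.body());
    }
}
```

Without the keep-alive change, the JVM exits as soon as node.start() fails, so there is nothing left to send this request to; that is the scenario the PR is trying to avoid.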
This seems like a really blunt change for an extremely specific case. In general, quitting the JVM and letting it restart is preferable to continuing to run in a partially-started state. Prior to this change, a transient failure in node.start() might automatically recover because the JVM would quit and whatever is monitoring the process would restart it. Now there is a risk (in my opinion) that the process will continue running in a non-functional, partially-started state that will require human intervention to resolve. How do we know that is not a risk here? The assumption that anything useful can be done with a partially started node was true in the specific case mentioned, but is not true in general.
@dbwiddis Not logging why the JVM shut down is clearly a problem. However, I'd argue this change might make things considerably worse. Instead of knowing there's a problem (because the process restarted), we'll instead have a partially-started node running in a crippled state with no indication that something went wrong during startup.
I can create the revert PR now, and I'll look into solution 2 as a long-term solution.
Thanks @zane-neo. Please also consider solution 1 and/or the bug I reported. One of those two issues should be fixed ASAP.
@andrross @dblock @dbwiddis Do you think we can conclude that no plugin should block OpenSearch from starting up, even when the plugin itself encounters an issue? If that is true, we can simply wrap the onNodeStarted method in a try/catch to avoid this. The pros of this fix are that it's simple and effective, and it prevents the same problem from recurring in other plugins.
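A minimal sketch of that try/catch idea, using simplified stand-ins for the plugin lifecycle types rather than the actual OpenSearch plugin API, so that one plugin failing in onNodeStarted is logged instead of aborting node startup:

```java
import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

// Simplified stand-ins for the real plugin lifecycle types.
interface LifecyclePlugin {
    String name();
    void onNodeStarted();
}

class PluginStartupNotifier {
    private static final Logger logger = LogManager.getLogger(PluginStartupNotifier.class);

    // Notify every plugin that the node has started; a failure in one plugin
    // is logged and does not prevent the remaining plugins, or the node,
    // from continuing.
    void notifyNodeStarted(List<LifecyclePlugin> plugins) {
        for (LifecyclePlugin plugin : plugins) {
            try {
                plugin.onNodeStarted();
            } catch (Exception e) {
                logger.error("plugin [" + plugin.name() + "] failed in onNodeStarted; continuing startup", e);
            }
        }
    }
}
```

As the next comment points out, swallowing these failures is not obviously safe for every plugin; a security plugin, for example, may need to fail the whole node rather than continue.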
@zane-neo I don't think this is true generally. For example, if the security plugin can't properly start up, it would be safer to fail than to come up with security disabled.
@andrross Thanks. I created this PR: opensearch-project/observability#1873 as a short-term fix, and I will continue exploring the long-term fix.