Skip to content

Commit

Permalink
fix cluster not able to spin up issue when disk usage exceeds thresho…
Browse files Browse the repository at this point in the history
…ld (opensearch-project#15258)

* fix cluster not able to spin up issue when disk usage exceeds threshold

Signed-off-by: zane-neo <zaniu@amazon.com>

* Add comment to changes

Signed-off-by: zane-neo <zaniu@amazon.com>

* Add UT to ensure the keepAliveThread starts before node starts

Signed-off-by: zane-neo <zaniu@amazon.com>

* remove unused imports

Signed-off-by: zane-neo <zaniu@amazon.com>

* Fix forbidden API calls check failed issue

Signed-off-by: zane-neo <zaniu@amazon.com>

* format code

Signed-off-by: zane-neo <zaniu@amazon.com>

* format code

Signed-off-by: zane-neo <zaniu@amazon.com>

* change setInstance method to static

Signed-off-by: zane-neo <zaniu@amazon.com>

* Add countdownlatch in test to coordinate the thread to avoid concureency issue caused test failure

Signed-off-by: zane-neo <zaniu@amazon.com>

---------

Signed-off-by: zane-neo <zaniu@amazon.com>
  • Loading branch information
zane-neo authored and dk2k committed Oct 21, 2024
1 parent 07b0da3 commit a7addd8
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Fix protobuf-java leak through client library dependencies ([#16254](https://github.com/opensearch-project/OpenSearch/pull/16254))
- Fix multi-search with template doesn't return status code ([#16265](https://github.com/opensearch-project/OpenSearch/pull/16265))
- Fix wrong default value when setting `index.number_of_routing_shards` to null on index creation ([#16331](https://github.com/opensearch-project/OpenSearch/pull/16331))
- Fix disk usage exceeds threshold cluster can't spin up issue ([#15258](https://github.com/opensearch-project/OpenSearch/pull/15258)))


### Security

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,15 @@

package org.opensearch.bootstrap;

import org.opensearch.common.logging.LogConfigurator;
import org.opensearch.common.settings.KeyStoreCommandTestCase;
import org.opensearch.common.settings.KeyStoreWrapper;
import org.opensearch.common.settings.SecureSettings;
import org.opensearch.common.settings.Settings;
import org.opensearch.common.util.io.IOUtils;
import org.opensearch.core.common.settings.SecureString;
import org.opensearch.env.Environment;
import org.opensearch.node.Node;
import org.opensearch.test.OpenSearchTestCase;
import org.junit.After;
import org.junit.Before;
Expand All @@ -51,8 +53,14 @@
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import static org.hamcrest.Matchers.equalTo;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;

public class BootstrapTests extends OpenSearchTestCase {
Environment env;
Expand Down Expand Up @@ -131,4 +139,38 @@ private void assertPassphraseRead(String source, String expected) {
}
}

public void testInitExecutionOrder() throws Exception {
AtomicInteger order = new AtomicInteger(0);
CountDownLatch countDownLatch = new CountDownLatch(1);
Thread mockThread = new Thread(() -> {
assertEquals(0, order.getAndIncrement());
countDownLatch.countDown();
});

Node mockNode = mock(Node.class);
doAnswer(invocation -> {
try {
boolean threadStarted = countDownLatch.await(1000, TimeUnit.MILLISECONDS);
assertTrue(
"Waited for one second but the keepAliveThread isn't started, please check the execution order of"
+ "keepAliveThread.start and node.start",
threadStarted
);
} catch (InterruptedException e) {
fail("Thread interrupted");
}
assertEquals(1, order.getAndIncrement());
return null;
}).when(mockNode).start();

LogConfigurator.registerErrorListener();
Bootstrap testBootstrap = new Bootstrap(mockThread, mockNode);
Bootstrap.setInstance(testBootstrap);

Bootstrap.startInstance(testBootstrap);

verify(mockNode).start();
assertEquals(2, order.get());
}

}
21 changes: 19 additions & 2 deletions server/src/main/java/org/opensearch/bootstrap/Bootstrap.java
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,17 @@ final class Bootstrap {
private final Thread keepAliveThread;
private final Spawner spawner = new Spawner();

// For testing purpose
static void setInstance(Bootstrap bootstrap) {
INSTANCE = bootstrap;
}

// For testing purpose
Bootstrap(Thread keepAliveThread, Node node) {
this.keepAliveThread = keepAliveThread;
this.node = node;
}

/** creates a new instance */
Bootstrap() {
keepAliveThread = new Thread(new Runnable() {
Expand Down Expand Up @@ -336,8 +347,10 @@ private static Environment createEnvironment(
}

private void start() throws NodeValidationException {
node.start();
// keepAliveThread should start first than node to ensure the cluster can spin up successfully in edge cases:
// https://github.com/opensearch-project/OpenSearch/issues/14791
keepAliveThread.start();
node.start();
}

static void stop() throws IOException {
Expand Down Expand Up @@ -410,7 +423,7 @@ static void init(final boolean foreground, final Path pidFile, final boolean qui
throw new BootstrapException(e);
}

INSTANCE.start();
startInstance(INSTANCE);

// We don't close stderr if `--quiet` is passed, because that
// hides fatal startup errors. For example, if OpenSearch is
Expand Down Expand Up @@ -462,6 +475,10 @@ static void init(final boolean foreground, final Path pidFile, final boolean qui
}
}

static void startInstance(Bootstrap instance) throws NodeValidationException {
instance.start();
}

@SuppressForbidden(reason = "System#out")
private static void closeSystOut() {
System.out.close();
Expand Down

0 comments on commit a7addd8

Please sign in to comment.