From 0fc8d07ea4bc48dd82cbdebddf7fd3e9f234997a Mon Sep 17 00:00:00 2001 From: Gordon Brown Date: Wed, 18 Aug 2021 10:37:51 -0600 Subject: [PATCH] Fix `NOT_STARTED` statuses appearing inappropriately during node shutdown (#75750) (#76634) * Fix `NOT_STARTED` statuses appearing inappropriately during node shutdown (#75750) This PR fixes two situations where `NOT_STARTED` can appear as the shard migration status inappropriately: 1. When the node is actually shut down after having all the shards migrate away. 2. When a non-data-node is registered for shutdown. It also adds tests to ensure these cases are handled correctly. * Fix compilation for backport * Fix compilation for backport --- .../metadata/SingleNodeShutdownMetadata.java | 37 ++++- .../metadata/NodesShutdownMetadataTests.java | 3 +- .../xpack/shutdown/NodeShutdownShardsIT.java | 147 ++++++++++++++++++ .../xpack/shutdown/NodeSeenService.java | 99 ++++++++++++ .../xpack/shutdown/ShutdownPlugin.java | 31 ++++ .../TransportGetShutdownStatusAction.java | 25 ++- .../TransportPutShutdownNodeAction.java | 3 + ...TransportGetShutdownStatusActionTests.java | 40 +++-- 8 files changed, 357 insertions(+), 28 deletions(-) create mode 100644 x-pack/plugin/shutdown/src/internalClusterTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownShardsIT.java create mode 100644 x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/NodeSeenService.java diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java b/server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java index 200878577b411..27789d27fd9c7 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/SingleNodeShutdownMetadata.java @@ -39,6 +39,7 @@ public class SingleNodeShutdownMetadata extends AbstractDiffable PARSER = new ConstructingObjectParser<>( "node_shutdown_info", @@ -47,7 +48,8 @@ public class SingleNodeShutdownMetadata extends AbstractDiffable TimeValue.parseTimeValue(p.textOrNull(), ALLOCATION_DELAY_FIELD.getPreferredName()), ALLOCATION_DELAY_FIELD, @@ -73,6 +76,7 @@ public static SingleNodeShutdownMetadata parse(XContentParser parser) { private final Type type; private final String reason; private final long startedAtMillis; + private final boolean nodeSeen; @Nullable private final TimeValue allocationDelay; /** @@ -86,12 +90,14 @@ private SingleNodeShutdownMetadata( Type type, String reason, long startedAtMillis, + boolean nodeSeen, @Nullable TimeValue allocationDelay ) { this.nodeId = Objects.requireNonNull(nodeId, "node ID must not be null"); this.type = Objects.requireNonNull(type, "shutdown type must not be null"); this.reason = Objects.requireNonNull(reason, "shutdown reason must not be null"); this.startedAtMillis = startedAtMillis; + this.nodeSeen = nodeSeen; if (allocationDelay != null && Type.RESTART.equals(type) == false) { throw new IllegalArgumentException("shard allocation delay is only valid for RESTART-type shutdowns"); } @@ -103,6 +109,7 @@ public SingleNodeShutdownMetadata(StreamInput in) throws IOException { this.type = in.readEnum(Type.class); this.reason = in.readString(); this.startedAtMillis = in.readVLong(); + this.nodeSeen = in.readBoolean(); this.allocationDelay = in.readOptionalTimeValue(); } @@ -134,6 +141,13 @@ public long getStartedAtMillis() { return startedAtMillis; } + /** + * @return A boolean indicating whether this node has been seen in the cluster since the
shutdown was registered. + */ + public boolean getNodeSeen() { + return nodeSeen; + } + /** * @return The amount of time shard reallocation should be delayed for shards on this node, so that they will not be automatically * reassigned while the node is restarting. Will be {@code null} for non-restart shutdowns. @@ -154,6 +168,7 @@ public void writeTo(StreamOutput out) throws IOException { out.writeEnum(type); out.writeString(reason); out.writeVLong(startedAtMillis); + out.writeBoolean(nodeSeen); out.writeOptionalTimeValue(allocationDelay); } @@ -165,6 +180,7 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.field(TYPE_FIELD.getPreferredName(), type); builder.field(REASON_FIELD.getPreferredName(), reason); builder.timeField(STARTED_AT_MILLIS_FIELD.getPreferredName(), STARTED_AT_READABLE_FIELD, startedAtMillis); + builder.field(NODE_SEEN_FIELD.getPreferredName(), nodeSeen); if (allocationDelay != null) { builder.field(ALLOCATION_DELAY_FIELD.getPreferredName(), allocationDelay.getStringRep()); } @@ -183,6 +199,7 @@ public boolean equals(Object o) { && getNodeId().equals(that.getNodeId()) && getType() == that.getType() && getReason().equals(that.getReason()) + && getNodeSeen() == that.getNodeSeen() && Objects.equals(allocationDelay, that.allocationDelay); } @@ -193,6 +210,7 @@ public int hashCode() { getType(), getReason(), getStartedAtMillis(), + getNodeSeen(), allocationDelay ); } @@ -209,7 +227,8 @@ public static Builder builder(SingleNodeShutdownMetadata original) { .setNodeId(original.getNodeId()) .setType(original.getType()) .setReason(original.getReason()) - .setStartedAtMillis(original.getStartedAtMillis()); + .setStartedAtMillis(original.getStartedAtMillis()) + .setNodeSeen(original.getNodeSeen()); } public static class Builder { @@ -217,6 +236,7 @@ public static class Builder { private Type type; private String reason; private long startedAtMillis = -1; + private boolean nodeSeen = false; private TimeValue allocationDelay; private Builder() {} @@ -257,6 +277,15 @@ public Builder setStartedAtMillis(long startedAtMillis) { return this; } + /** + * @param nodeSeen Whether or not the node has been seen since the shutdown was registered. + * @return This builder. + */ + public Builder setNodeSeen(boolean nodeSeen) { + this.nodeSeen = nodeSeen; + return this; + } + /** * @param allocationDelay The amount of time shard reallocation should be delayed while this node is offline. * @return This builder. 
@@ -275,7 +304,9 @@ public SingleNodeShutdownMetadata build() { nodeId, type, reason, - startedAtMillis, allocationDelay + startedAtMillis, + nodeSeen, + allocationDelay ); } } diff --git a/server/src/test/java/org/elasticsearch/cluster/metadata/NodesShutdownMetadataTests.java b/server/src/test/java/org/elasticsearch/cluster/metadata/NodesShutdownMetadataTests.java index cfa3909c4db5b..0b22403198512 100644 --- a/server/src/test/java/org/elasticsearch/cluster/metadata/NodesShutdownMetadataTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/metadata/NodesShutdownMetadataTests.java @@ -88,7 +88,8 @@ private SingleNodeShutdownMetadata randomNodeShutdownInfo() { if (type.equals(SingleNodeShutdownMetadata.Type.RESTART) && randomBoolean()) { builder.setAllocationDelay(TimeValue.parseTimeValue(randomTimeValue(), this.getTestName())); } - return builder.build(); + return builder.setNodeSeen(randomBoolean()) + .build(); } @Override diff --git a/x-pack/plugin/shutdown/src/internalClusterTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownShardsIT.java b/x-pack/plugin/shutdown/src/internalClusterTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownShardsIT.java new file mode 100644 index 0000000000000..98564312e6bc0 --- /dev/null +++ b/x-pack/plugin/shutdown/src/internalClusterTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownShardsIT.java @@ -0,0 +1,147 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.shutdown; + +import org.elasticsearch.Build; +import org.elasticsearch.action.admin.cluster.node.info.NodeInfo; +import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; +import org.elasticsearch.action.support.master.AcknowledgedResponse; +import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata; +import org.elasticsearch.cluster.node.DiscoveryNode; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.test.InternalTestCluster; + +import java.util.Arrays; +import java.util.Collection; + +import static org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata.Status.COMPLETE; +import static org.hamcrest.Matchers.equalTo; + +@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0, numClientNodes = 0, transportClientRatio = 0) +public class NodeShutdownShardsIT extends ESIntegTestCase { + + @Override + protected Collection> nodePlugins() { + return Arrays.asList(ShutdownPlugin.class); + } + + /** + * Verifies that a node that's removed from the cluster with zero shards stays in the `COMPLETE` status after it leaves, rather than + * reverting to `NOT_STARTED` (this was a bug in the initial implementation). 
+ */ + public void testShardStatusStaysCompleteAfterNodeLeaves() throws Exception { + assumeTrue("must be on a snapshot build of ES to run in order for the feature flag to be set", Build.CURRENT.isSnapshot()); + final String nodeToRestartName = internalCluster().startNode(); + final String nodeToRestartId = getNodeId(nodeToRestartName); + internalCluster().startNode(); + + // Mark the node for shutdown + PutShutdownNodeAction.Request putShutdownRequest = new PutShutdownNodeAction.Request( + nodeToRestartId, + SingleNodeShutdownMetadata.Type.REMOVE, + this.getTestName(), + null + ); + AcknowledgedResponse putShutdownResponse = client().execute(PutShutdownNodeAction.INSTANCE, putShutdownRequest).get(); + assertTrue(putShutdownResponse.isAcknowledged()); + + internalCluster().stopNode(nodeToRestartName); + + NodesInfoResponse nodes = client().admin().cluster().prepareNodesInfo().clear().get(); + assertThat(nodes.getNodes().size(), equalTo(1)); + + GetShutdownStatusAction.Response getResp = client().execute( + GetShutdownStatusAction.INSTANCE, + new GetShutdownStatusAction.Request(nodeToRestartId) + ).get(); + + assertThat(getResp.getShutdownStatuses().get(0).migrationStatus().getStatus(), equalTo(COMPLETE)); + } + + /** + * Similar to the previous test, but ensures that the status stays at `COMPLETE` when the node is offline when the shutdown is + * registered. This may happen if {@link NodeSeenService} isn't working as expected. + */ + public void testShardStatusStaysCompleteAfterNodeLeavesIfRegisteredWhileNodeOffline() throws Exception { + assumeTrue("must be on a snapshot build of ES to run in order for the feature flag to be set", Build.CURRENT.isSnapshot()); + final String nodeToRestartName = internalCluster().startNode(); + final String nodeToRestartId = getNodeId(nodeToRestartName); + internalCluster().startNode(); + + // Stop the node we're going to shut down and mark it as shutting down while it's offline. This checks that the cluster state + // listener is working correctly. + internalCluster().restartNode(nodeToRestartName, new InternalTestCluster.RestartCallback() { + @Override + public Settings onNodeStopped(String nodeName) throws Exception { + PutShutdownNodeAction.Request putShutdownRequest = new PutShutdownNodeAction.Request( + nodeToRestartId, + SingleNodeShutdownMetadata.Type.REMOVE, + "testShardStatusStaysCompleteAfterNodeLeavesIfRegisteredWhileNodeOffline", + null + ); + AcknowledgedResponse putShutdownResponse = client().execute(PutShutdownNodeAction.INSTANCE, putShutdownRequest).get(); + assertTrue(putShutdownResponse.isAcknowledged()); + + return super.onNodeStopped(nodeName); + } + }); + + internalCluster().stopNode(nodeToRestartName); + + NodesInfoResponse nodes = client().admin().cluster().prepareNodesInfo().clear().get(); + assertThat(nodes.getNodes().size(), equalTo(1)); + + GetShutdownStatusAction.Response getResp = client().execute( + GetShutdownStatusAction.INSTANCE, + new GetShutdownStatusAction.Request(nodeToRestartId) + ).get(); + + assertThat(getResp.getShutdownStatuses().get(0).migrationStatus().getStatus(), equalTo(COMPLETE)); + } + + /** + * Checks that non-data nodes that are registered for shutdown have a shard migration status of `COMPLETE` rather than `NOT_STARTED`. + * (this was a bug in the initial implementation). 
+ */ + public void testShardStatusIsCompleteOnNonDataNodes() throws Exception { + assumeTrue("must be on a snapshot build of ES to run in order for the feature flag to be set", Build.CURRENT.isSnapshot()); + final String nodeToShutDownName = internalCluster().startMasterOnlyNode(); + internalCluster().startMasterOnlyNode(); // Just to have at least one other node + final String nodeToRestartId = getNodeId(nodeToShutDownName); + + // Mark the node for shutdown + PutShutdownNodeAction.Request putShutdownRequest = new PutShutdownNodeAction.Request( + nodeToRestartId, + SingleNodeShutdownMetadata.Type.REMOVE, + this.getTestName(), + null + ); + AcknowledgedResponse putShutdownResponse = client().execute(PutShutdownNodeAction.INSTANCE, putShutdownRequest).get(); + assertTrue(putShutdownResponse.isAcknowledged()); + + GetShutdownStatusAction.Response getResp = client().execute( + GetShutdownStatusAction.INSTANCE, + new GetShutdownStatusAction.Request(nodeToRestartId) + ).get(); + + assertThat(getResp.getShutdownStatuses().get(0).migrationStatus().getStatus(), equalTo(COMPLETE)); + } + + private String getNodeId(String nodeName) throws Exception { + NodesInfoResponse nodes = client().admin().cluster().prepareNodesInfo().clear().get(); + return nodes.getNodes() + .stream() + .map(NodeInfo::getNode) + .filter(node -> node.getName().equals(nodeName)) + .map(DiscoveryNode::getId) + .findFirst() + .orElseThrow(() -> new AssertionError("requested node name [" + nodeName + "] not found")); + } +} diff --git a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/NodeSeenService.java b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/NodeSeenService.java new file mode 100644 index 0000000000000..3bf3a846a8a27 --- /dev/null +++ b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/NodeSeenService.java @@ -0,0 +1,99 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.shutdown; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; +import org.elasticsearch.cluster.ClusterChangedEvent; +import org.elasticsearch.cluster.ClusterState; +import org.elasticsearch.cluster.ClusterStateListener; +import org.elasticsearch.cluster.ClusterStateUpdateTask; +import org.elasticsearch.cluster.metadata.Metadata; +import org.elasticsearch.cluster.metadata.NodesShutdownMetadata; +import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata; +import org.elasticsearch.cluster.service.ClusterService; + +import java.util.Map; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** + * A class that handles ongoing reactive logic related to Node Shutdown. + * + * Currently, this consists of keeping track of whether we've seen nodes which are marked for shutdown.
+ */ +public class NodeSeenService implements ClusterStateListener { + private static final Logger logger = LogManager.getLogger(NodeSeenService.class); + + final ClusterService clusterService; + + public NodeSeenService(ClusterService clusterService) { + this.clusterService = clusterService; + clusterService.addListener(this); + } + + @Override + public void clusterChanged(ClusterChangedEvent event) { + if (event.state().nodes().isLocalNodeElectedMaster() == false) { + // Only do this if we're the current master node. + return; + } + + if (event.nodesAdded() == false) { + // If there's no new nodes this cluster state update, nothing to do. + return; + } + + NodesShutdownMetadata eventShutdownMetadata = event.state().metadata().custom(NodesShutdownMetadata.TYPE); + final Set nodesNotPreviouslySeen = eventShutdownMetadata.getAllNodeMetadataMap() + .values() + .stream() + .filter(singleNodeShutdownMetadata -> singleNodeShutdownMetadata.getNodeSeen() == false) + .map(SingleNodeShutdownMetadata::getNodeId) + .filter(nodeId -> event.state().nodes().nodeExists(nodeId)) + .collect(Collectors.toSet()); + + if (nodesNotPreviouslySeen.isEmpty() == false) { + clusterService.submitStateUpdateTask("shutdown-seen-nodes-updater", new ClusterStateUpdateTask() { + @Override + public ClusterState execute(ClusterState currentState) throws Exception { + NodesShutdownMetadata currentShutdownMetadata = currentState.metadata().custom(NodesShutdownMetadata.TYPE); + + final Map newShutdownMetadataMap = currentShutdownMetadata.getAllNodeMetadataMap() + .values() + .stream() + .map(singleNodeShutdownMetadata -> { + if (nodesNotPreviouslySeen.contains(singleNodeShutdownMetadata.getNodeId()) + || currentState.nodes().nodeExists(singleNodeShutdownMetadata.getNodeId())) { + return SingleNodeShutdownMetadata.builder(singleNodeShutdownMetadata).setNodeSeen(true).build(); + } + return singleNodeShutdownMetadata; + }) + .collect(Collectors.toMap(SingleNodeShutdownMetadata::getNodeId, Function.identity())); + + final NodesShutdownMetadata newNodesMetadata = new NodesShutdownMetadata(newShutdownMetadataMap); + if (newNodesMetadata.equals(currentShutdownMetadata)) { + // Turns out the update was a no-op + return currentState; + } + + return ClusterState.builder(currentState) + .metadata(Metadata.builder(currentState.metadata()).putCustom(NodesShutdownMetadata.TYPE, newNodesMetadata).build()) + .build(); + } + + @Override + public void onFailure(String source, Exception e) { + logger.warn(new ParameterizedMessage("failed to mark shutting down nodes as seen: {}", nodesNotPreviouslySeen), e); + } + }); + } + } +} diff --git a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/ShutdownPlugin.java b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/ShutdownPlugin.java index 2b796bdda060f..5fa7da8312263 100644 --- a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/ShutdownPlugin.java +++ b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/ShutdownPlugin.java @@ -10,19 +10,30 @@ import org.elasticsearch.action.ActionRequest; import org.elasticsearch.action.ActionResponse; import org.elasticsearch.action.support.master.AcknowledgedResponse; +import org.elasticsearch.client.Client; import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.cluster.service.ClusterService; +import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import 
org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.IndexScopedSettings; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.SettingsFilter; +import org.elasticsearch.common.xcontent.NamedXContentRegistry; +import org.elasticsearch.env.Environment; +import org.elasticsearch.env.NodeEnvironment; import org.elasticsearch.plugins.ActionPlugin; import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.repositories.RepositoriesService; import org.elasticsearch.rest.RestController; import org.elasticsearch.rest.RestHandler; +import org.elasticsearch.script.ScriptService; +import org.elasticsearch.threadpool.ThreadPool; +import org.elasticsearch.watcher.ResourceWatcherService; import java.util.Arrays; +import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.function.Supplier; @@ -56,6 +67,26 @@ public boolean isEnabled(Settings settings) { return SHUTDOWN_FEATURE_ENABLED_FLAG_SETTING.get(settings); } + @Override + public Collection createComponents( + Client client, + ClusterService clusterService, + ThreadPool threadPool, + ResourceWatcherService resourceWatcherService, + ScriptService scriptService, + NamedXContentRegistry xContentRegistry, + Environment environment, + NodeEnvironment nodeEnvironment, + NamedWriteableRegistry namedWriteableRegistry, + IndexNameExpressionResolver indexNameExpressionResolver, + Supplier repositoriesServiceSupplier + ) { + + NodeSeenService nodeSeenService = new NodeSeenService(clusterService); + + return Collections.singletonList(nodeSeenService); + } + @Override public List> getActions() { ActionHandler putShutdown = new ActionHandler<>( diff --git a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java index fd480fecdfde7..b8a04207d7188 100644 --- a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java +++ b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java @@ -110,10 +110,11 @@ protected void masterOperation( state, ns.getNodeId(), ns.getType(), - allocationDeciders, + ns.getNodeSeen(), clusterInfoService, snapshotsInfoService, - allocationService + allocationService, + allocationDeciders ), new ShutdownPersistentTasksStatus(), new ShutdownPluginsStatus(pluginShutdownService.readyToShutdown(ns.getNodeId(), ns.getType())) @@ -134,10 +135,11 @@ protected void masterOperation( state, ns.getNodeId(), ns.getType(), - allocationDeciders, + ns.getNodeSeen(), clusterInfoService, snapshotsInfoService, - allocationService + allocationService, + allocationDeciders ), new ShutdownPersistentTasksStatus(), new ShutdownPluginsStatus(pluginShutdownService.readyToShutdown(ns.getNodeId(), ns.getType())) @@ -156,10 +158,11 @@ static ShutdownShardMigrationStatus shardMigrationStatus( ClusterState currentState, String nodeId, SingleNodeShutdownMetadata.Type shutdownType, - AllocationDeciders allocationDeciders, + boolean nodeSeen, ClusterInfoService clusterInfoService, SnapshotsInfoService snapshotsInfoService, - AllocationService allocationService + AllocationService allocationService, + AllocationDeciders allocationDeciders ) { // Only REMOVE-type shutdowns will try to move shards, so RESTART-type shutdowns should immediately 
complete if (SingleNodeShutdownMetadata.Type.RESTART.equals(shutdownType)) { @@ -170,8 +173,8 @@ static ShutdownShardMigrationStatus shardMigrationStatus( ); } - if (currentState.getRoutingNodes().node(nodeId) == null) { - // We don't know about that node + if (currentState.nodes().get(nodeId) == null && nodeSeen == false) { + // The node isn't in the cluster return new ShutdownShardMigrationStatus( SingleNodeShutdownMetadata.Status.NOT_STARTED, 0, @@ -179,6 +182,12 @@ static ShutdownShardMigrationStatus shardMigrationStatus( ); } + // The node is in `DiscoveryNodes`, but not `RoutingNodes` - so there are no shards assigned to it. We're done. + if (currentState.getRoutingNodes().node(nodeId) == null) { + // We don't know about that node + return new ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status.COMPLETE, 0); + } + // First, check if there are any shards currently on this node, and if there are any relocating shards int startedShards = currentState.getRoutingNodes().node(nodeId).numberOfShardsWithState(ShardRoutingState.STARTED); int relocatingShards = currentState.getRoutingNodes().node(nodeId).numberOfShardsWithState(ShardRoutingState.RELOCATING); diff --git a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java index 146fbbcf2561f..2b3d81ca01f8a 100644 --- a/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java +++ b/x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportPutShutdownNodeAction.java @@ -77,11 +77,14 @@ public ClusterState execute(ClusterState currentState) { ); } + final boolean nodeSeen = currentState.getNodes().nodeExists(request.getNodeId()); + SingleNodeShutdownMetadata newNodeMetadata = SingleNodeShutdownMetadata.builder() .setNodeId(request.getNodeId()) .setType(request.getType()) .setReason(request.getReason()) .setStartedAtMillis(System.currentTimeMillis()) + .setNodeSeen(nodeSeen) .setAllocationDelay(request.getAllocationDelay()) .build(); diff --git a/x-pack/plugin/shutdown/src/test/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusActionTests.java b/x-pack/plugin/shutdown/src/test/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusActionTests.java index c45231f810144..cd4435db25c8b 100644 --- a/x-pack/plugin/shutdown/src/test/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusActionTests.java +++ b/x-pack/plugin/shutdown/src/test/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusActionTests.java @@ -131,10 +131,11 @@ public void testEmptyCluster() { state, SHUTTING_DOWN_NODE_ID, SingleNodeShutdownMetadata.Type.REMOVE, - allocationDeciders, + false, clusterInfoService, snapshotsInfoService, - allocationService + allocationService, + allocationDeciders ); assertShardMigration(status, SingleNodeShutdownMetadata.Status.COMPLETE, 0, nullValue()); @@ -164,10 +165,11 @@ public void testRestartAlwaysComplete() { state, SHUTTING_DOWN_NODE_ID, SingleNodeShutdownMetadata.Type.RESTART, - allocationDeciders, + randomBoolean(), // Whether the node has been seen doesn't matter, restart-type shutdowns should always say COMPLETE here. 
clusterInfoService, snapshotsInfoService, - allocationService + allocationService, + allocationDeciders ); assertShardMigration( @@ -203,10 +205,11 @@ public void testComplete() { state, SHUTTING_DOWN_NODE_ID, SingleNodeShutdownMetadata.Type.REMOVE, - allocationDeciders, + true, clusterInfoService, snapshotsInfoService, - allocationService + allocationService, + allocationDeciders ); assertShardMigration(status, SingleNodeShutdownMetadata.Status.COMPLETE, 0, nullValue()); @@ -251,10 +254,11 @@ public void testInProgressWithRelocatingShards() { state, SHUTTING_DOWN_NODE_ID, SingleNodeShutdownMetadata.Type.REMOVE, - allocationDeciders, + true, clusterInfoService, snapshotsInfoService, - allocationService + allocationService, + allocationDeciders ); assertShardMigration(status, SingleNodeShutdownMetadata.Status.IN_PROGRESS, 2, nullValue()); @@ -306,10 +310,11 @@ public void testInProgressWithShardsMovingBetweenOtherNodes() { state, SHUTTING_DOWN_NODE_ID, SingleNodeShutdownMetadata.Type.REMOVE, - allocationDeciders, + true, clusterInfoService, snapshotsInfoService, - allocationService + allocationService, + allocationDeciders ); assertShardMigration(status, SingleNodeShutdownMetadata.Status.IN_PROGRESS, 1, nullValue()); @@ -345,10 +350,11 @@ public void testStalled() { state, SHUTTING_DOWN_NODE_ID, SingleNodeShutdownMetadata.Type.REMOVE, - allocationDeciders, + true, clusterInfoService, snapshotsInfoService, - allocationService + allocationService, + allocationDeciders ); assertShardMigration( @@ -380,10 +386,11 @@ public void testOnlyInitializingShardsRemaining() { state, SHUTTING_DOWN_NODE_ID, SingleNodeShutdownMetadata.Type.REMOVE, - allocationDeciders, + true, clusterInfoService, snapshotsInfoService, - allocationService + allocationService, + allocationDeciders ); assertShardMigration( @@ -455,10 +462,11 @@ public void testNodeNotInCluster() { state, bogusNodeId, SingleNodeShutdownMetadata.Type.REMOVE, - allocationDeciders, + false, clusterInfoService, snapshotsInfoService, - allocationService + allocationService, + allocationDeciders ); assertShardMigration(status, SingleNodeShutdownMetadata.Status.NOT_STARTED, 0, is("node is not currently part of the cluster"));