-
Notifications
You must be signed in to change notification settings - Fork 24.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Cancel recoveries even if all shards assigned #46520
Changes from 12 commits
89978bc
cac38ad
e436afb
51d1bb3
be2546b
28d2e7c
c3e1387
bf5248f
dc25223
45da6a3
3d90508
0cff47a
f83dcb5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -324,6 +324,81 @@ public void testReplicaRecovery() throws Exception { | |||||||
assertHitCount(client().prepareSearch(INDEX_NAME).setSize(0).get(), numOfDocs); | ||||||||
} | ||||||||
|
||||||||
public void testCancelNewShardRecoveryAndUsesExistingShardCopy() throws Exception { | ||||||||
logger.info("--> start node A"); | ||||||||
final String nodeA = internalCluster().startNode(); | ||||||||
|
||||||||
logger.info("--> create index on node: {}", nodeA); | ||||||||
ByteSizeValue shardSize = createAndPopulateIndex(INDEX_NAME, 1, SHARD_COUNT, REPLICA_COUNT) | ||||||||
.getShards()[0].getStats().getStore().size(); | ||||||||
|
||||||||
logger.info("--> start node B"); | ||||||||
// force a shard recovery from nodeA to nodeB | ||||||||
final String nodeB = internalCluster().startNode(); | ||||||||
Settings nodeBDataPathSettings = internalCluster().dataPathSettings(nodeB); | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||
|
||||||||
logger.info("--> add replica for {} on node: {}", INDEX_NAME, nodeB); | ||||||||
assertAcked(client().admin().indices().prepareUpdateSettings(INDEX_NAME) | ||||||||
.setSettings(Settings.builder() | ||||||||
.put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1) | ||||||||
.put(UnassignedInfo.INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.getKey(), 0))); | ||||||||
ensureGreen(INDEX_NAME); | ||||||||
|
||||||||
logger.info("--> start node C"); | ||||||||
final String nodeC = internalCluster().startNode(); | ||||||||
assertFalse(client().admin().cluster().prepareHealth().setWaitForNodes("3").get().isTimedOut()); | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd normally recommend the shorthand
Suggested change
but I don't think this is necessary:
Therefore I think we can drop this:
Suggested change
|
||||||||
|
||||||||
// perform a synced flush to generate a sync id | ||||||||
assertThat(client().admin().indices().prepareSyncedFlush(INDEX_NAME).get().failedShards(), equalTo(0)); | ||||||||
|
||||||||
// hold the peer recovery at phase 2 after nodeB goes down | ||||||||
CountDownLatch allowToCompletePhase1Latch = new CountDownLatch(1); | ||||||||
MockTransportService transportService = (MockTransportService) internalCluster().getInstance(TransportService.class, nodeA); | ||||||||
transportService.addSendBehavior((connection, requestId, action, request, options) -> { | ||||||||
if (PeerRecoveryTargetService.Actions.CLEAN_FILES.equals(action)) { | ||||||||
try { | ||||||||
allowToCompletePhase1Latch.await(); | ||||||||
} catch (InterruptedException e) { | ||||||||
throw new AssertionError(e); | ||||||||
} | ||||||||
} | ||||||||
connection.sendRequest(requestId, action, request, options); | ||||||||
}); | ||||||||
|
||||||||
logger.info("--> restart node B"); | ||||||||
internalCluster().restartNode(nodeB, | ||||||||
new InternalTestCluster.RestartCallback() { | ||||||||
@Override | ||||||||
public Settings onNodeStopped(String nodeName) throws Exception { | ||||||||
assertBusy(() -> { | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 😁 I was just about to note the missing wait here. I think it'd be neater to wait for node A to send its |
||||||||
// nodeB has stopped; a peer recovery from nodeA to nodeC starts, and it will be cancelled after nodeB is restarted. | ||||||||
RecoveryResponse response = client().admin().indices().prepareRecoveries(INDEX_NAME).execute().actionGet(); | ||||||||
|
||||||||
List<RecoveryState> recoveryStates = response.shardRecoveryStates().get(INDEX_NAME); | ||||||||
List<RecoveryState> nodeARecoveryStates = findRecoveriesForTargetNode(nodeA, recoveryStates); | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we do not need to say anything about the recoveries on node A. These assertions are true, but not particularly important for this test. |
||||||||
assertThat(nodeARecoveryStates.size(), equalTo(1)); | ||||||||
List<RecoveryState> nodeCRecoveryStates = findRecoveriesForTargetNode(nodeC, recoveryStates); | ||||||||
assertThat(nodeCRecoveryStates.size(), equalTo(1)); | ||||||||
|
||||||||
assertRecoveryState(nodeARecoveryStates.get(0), 0, RecoverySource.EmptyStoreRecoverySource.INSTANCE, | ||||||||
true, Stage.DONE, null, nodeA); | ||||||||
validateIndexRecoveryState(nodeARecoveryStates.get(0).getIndex()); | ||||||||
|
||||||||
assertOnGoingRecoveryState(nodeCRecoveryStates.get(0), 0, PeerRecoverySource.INSTANCE, | ||||||||
false, nodeA, nodeC); | ||||||||
validateIndexRecoveryState(nodeCRecoveryStates.get(0).getIndex()); | ||||||||
}); | ||||||||
|
||||||||
return super.onNodeStopped(nodeName); | ||||||||
} | ||||||||
}); | ||||||||
|
||||||||
// wait for the peer recovery from nodeA to nodeB to finish | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It took me some time to work out why this works - I suggest this comment explaining it:
Suggested change
|
||||||||
ensureGreen(); | ||||||||
allowToCompletePhase1Latch.countDown(); | ||||||||
transportService.clearAllRules(); | ||||||||
DaveCTurner marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||
} | ||||||||
|
||||||||
public void testRerouteRecovery() throws Exception { | ||||||||
logger.info("--> start node A"); | ||||||||
final String nodeA = internalCluster().startNode(); | ||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
shardSize
is unused: