Skip to content

Commit

Permalink
[Bugfix] Fix NPE in ReplicaShardAllocator (#13993) (#14385)
Browse files Browse the repository at this point in the history
* [Bugfix] Fix NPE in ReplicaShardAllocator (#13993)

Signed-off-by: Daniil Roman <daniilroman.cv@gmail.com>

* Add fix info to CHANGELOG.md

Signed-off-by: Daniil Roman <danroman17397@gmail.com>

---------

Signed-off-by: Daniil Roman <daniilroman.cv@gmail.com>
Signed-off-by: Daniil Roman <danroman17397@gmail.com>
(cherry picked from commit cf0d6cc)
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  • Loading branch information
github-actions[bot] committed Jul 18, 2024
1 parent b366c51 commit 947dc86
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Refactoring Grok.validatePatternBank by using an iterative approach ([#14206](https://github.com/opensearch-project/OpenSearch/pull/14206))
- Update help output for _cat ([#14722](https://github.com/opensearch-project/OpenSearch/pull/14722))
- Fix bulk upsert ignores the default_pipeline and final_pipeline when auto-created index matches the index template ([#12891](https://github.com/opensearch-project/OpenSearch/pull/12891))
- Fix NPE in ReplicaShardAllocator ([#14385](https://github.com/opensearch-project/OpenSearch/pull/14385))

### Security

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,10 @@ protected Runnable cancelExistingRecoveryForBetterMatch(
Metadata metadata = allocation.metadata();
RoutingNodes routingNodes = allocation.routingNodes();
ShardRouting primaryShard = allocation.routingNodes().activePrimary(shard.shardId());
assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary";
if (primaryShard == null) {
logger.trace("{}: no active primary shard found or allocated, letting actual allocation figure it out", shard);
return null;
}
assert primaryShard.currentNodeId() != null;
final DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,25 @@ public void testDoNotCancelForBrokenNode() {
assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED), empty());
}

/**
 * Checks that processing existing recoveries changes nothing when the primary on node1 is
 * only INITIALIZING: the routing nodes must be reported as unchanged and no shard may be
 * left in the UNASSIGNED state.
 */
public void testDoNotCancelForInactivePrimaryNode() {
    final RoutingAllocation allocation = oneInactivePrimaryOnNode1And1ReplicaRecovering(yesAllocationDeciders(), null);

    // Identical file metadata is registered for node1 and node2.
    final StoreFileMetadata node1File = new StoreFileMetadata("file1", 10, "MATCH_CHECKSUM", MIN_SUPPORTED_LUCENE_VERSION);
    final StoreFileMetadata node2File = new StoreFileMetadata("file1", 10, "MATCH_CHECKSUM", MIN_SUPPORTED_LUCENE_VERSION);
    testBatchAllocator.addData(node1, null, "MATCH", null, node1File).addData(node2, randomSyncId(), null, node2File);

    // Hand every currently INITIALIZING shard to the allocator as a single batch.
    testBatchAllocator.processExistingRecoveries(
        allocation,
        Collections.singletonList(new ArrayList<>(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING)))
    );

    final boolean routingChanged = allocation.routingNodesChanged();
    assertThat(routingChanged, equalTo(false));
    assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED), empty());
}

public void testAllocateUnassignedBatchThrottlingAllocationDeciderIsHonoured() throws InterruptedException {
ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
AllocationDeciders allocationDeciders = randomAllocationDeciders(
Expand Down Expand Up @@ -872,6 +891,41 @@ private RoutingAllocation onePrimaryOnNode1And1ReplicaRecovering(AllocationDecid
);
}

/**
 * Builds a {@link RoutingAllocation} over a two-node cluster state (node1, node2) in which the
 * primary copy of {@code shardId} is INITIALIZING on node1 and a replica copy is INITIALIZING
 * on node2.
 *
 * @param deciders       allocation deciders handed to the resulting allocation
 * @param unassignedInfo unassigned info attached to the replica shard routing
 * @return a routing allocation backed by the constructed cluster state
 */
private RoutingAllocation oneInactivePrimaryOnNode1And1ReplicaRecovering(AllocationDeciders deciders, UnassignedInfo unassignedInfo) {
    // Primary is created first, then the replica, mirroring the order they are added below.
    final ShardRouting initializingPrimary = TestShardRouting.newShardRouting(shardId, node1.getId(), true, ShardRoutingState.INITIALIZING);
    final ShardRouting initializingReplica = TestShardRouting.newShardRouting(
        shardId,
        node2.getId(),
        null,
        false,
        ShardRoutingState.INITIALIZING,
        unassignedInfo
    );

    final IndexShardRoutingTable shardRoutingTable = new IndexShardRoutingTable.Builder(shardId).addShard(initializingPrimary)
        .addShard(initializingReplica)
        .build();
    final RoutingTable routingTable = RoutingTable.builder()
        .add(IndexRoutingTable.builder(shardId.getIndex()).addIndexShard(shardRoutingTable))
        .build();

    final ClusterState clusterState = ClusterState.builder(
        org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)
    ).routingTable(routingTable).nodes(DiscoveryNodes.builder().add(node1).add(node2)).build();

    return new RoutingAllocation(
        deciders,
        new RoutingNodes(clusterState, false),
        clusterState,
        ClusterInfo.EMPTY,
        SnapshotShardSizeInfo.EMPTY,
        System.nanoTime()
    );
}

/**
 * Convenience overload that delegates to the two-argument variant with an
 * {@link UnassignedInfo} whose reason is {@code CLUSTER_RECOVERED} and whose message is null.
 *
 * @param deciders allocation deciders handed to the resulting allocation
 * @return the allocation produced by the two-argument overload
 */
private RoutingAllocation onePrimaryOnNode1And1ReplicaRecovering(AllocationDeciders deciders) {
    final UnassignedInfo clusterRecovered = new UnassignedInfo(UnassignedInfo.Reason.CLUSTER_RECOVERED, null);
    return onePrimaryOnNode1And1ReplicaRecovering(deciders, clusterRecovered);
}
Expand Down

0 comments on commit 947dc86

Please sign in to comment.