opensearch-project · dblock · Jul 18, 2024 · Jun 16, 2024 · Jul 18, 2024 · Jul 18, 2024
@@ -75,6 +75,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Refactoring Grok.validatePatternBank by using an iterative approach ([#14206](https://github.com/opensearch-project/OpenSearch/pull/14206))
 - Update help output for _cat ([#14722](https://github.com/opensearch-project/OpenSearch/pull/14722))
 - Fix bulk upsert ignores the default_pipeline and final_pipeline when auto-created index matches the index template ([#12891](https://github.com/opensearch-project/OpenSearch/pull/12891))
+- Fix NPE in ReplicaShardAllocator ([#14385](https://github.com/opensearch-project/OpenSearch/pull/14385))
 
 ### Security
 

@@ -100,7 +100,10 @@ protected Runnable cancelExistingRecoveryForBetterMatch(
         Metadata metadata = allocation.metadata();
         RoutingNodes routingNodes = allocation.routingNodes();
         ShardRouting primaryShard = allocation.routingNodes().activePrimary(shard.shardId());
-        assert primaryShard != null : "the replica shard can be allocated on at least one node, so there must be an active primary";
+        if (primaryShard == null) {
+            logger.trace("{}: no active primary shard found or allocated, letting actual allocation figure it out", shard);
+            return null;
+        }
         assert primaryShard.currentNodeId() != null;
         final DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
 

@@ -644,6 +644,25 @@ public void testDoNotCancelForBrokenNode() {
         assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED), empty());
     }
 
+    public void testDoNotCancelForInactivePrimaryNode() { // Checks that processExistingRecoveries changes nothing when the primary is still INITIALIZING.
+        RoutingAllocation allocation = oneInactivePrimaryOnNode1And1ReplicaRecovering(yesAllocationDeciders(), null); // null = no UnassignedInfo for the replica
+        testBatchAllocator.addData(
+            node1, // node1 hosts the INITIALIZING primary built by the helper
+            null,
+            "MATCH",
+            null,
+            new StoreFileMetadata("file1", 10, "MATCH_CHECKSUM", MIN_SUPPORTED_LUCENE_VERSION)
+        ).addData(node2, randomSyncId(), null, new StoreFileMetadata("file1", 10, "MATCH_CHECKSUM", MIN_SUPPORTED_LUCENE_VERSION)); // node2 hosts the recovering replica; same file name, length and checksum as node1
+
+        testBatchAllocator.processExistingRecoveries(
+            allocation,
+            Collections.singletonList(new ArrayList<>(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING))) // a single-element list wrapping every INITIALIZING shard
+        );
+
+        assertThat(allocation.routingNodesChanged(), equalTo(false)); // the call must not have modified the routing nodes
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED), empty()); // and no shard may have ended up UNASSIGNED
+    }
+
     public void testAllocateUnassignedBatchThrottlingAllocationDeciderIsHonoured() throws InterruptedException {
         ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
         AllocationDeciders allocationDeciders = randomAllocationDeciders(
@@ -872,6 +891,41 @@ private RoutingAllocation onePrimaryOnNode1And1ReplicaRecovering(AllocationDecid
         );
     }
 
+    private RoutingAllocation oneInactivePrimaryOnNode1And1ReplicaRecovering(AllocationDeciders deciders, UnassignedInfo unassignedInfo) { // Builds a RoutingAllocation with an INITIALIZING primary on node1 and an INITIALIZING replica on node2.
+        ShardRouting primaryShard = TestShardRouting.newShardRouting(shardId, node1.getId(), true, ShardRoutingState.INITIALIZING); // INITIALIZING rather than STARTED: the "inactive" primary of the method name
+        RoutingTable routingTable = RoutingTable.builder()
+            .add(
+                IndexRoutingTable.builder(shardId.getIndex())
+                    .addIndexShard(
+                        new IndexShardRoutingTable.Builder(shardId).addShard(primaryShard)
+                            .addShard(
+                                TestShardRouting.newShardRouting(
+                                    shardId,
+                                    node2.getId(),
+                                    null, // NOTE(review): presumably the relocating node id (none) -- confirm against TestShardRouting
+                                    false, // NOTE(review): presumably the primary flag, i.e. this shard is the replica -- confirm
+                                    ShardRoutingState.INITIALIZING,
+                                    unassignedInfo // caller-supplied; may be null
+                                )
+                            )
+                            .build()
+                    )
+            )
+            .build();
+        ClusterState state = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
+            .routingTable(routingTable)
+            .nodes(DiscoveryNodes.builder().add(node1).add(node2)) // both nodes are part of the cluster state
+            .build();
+        return new RoutingAllocation(
+            deciders,
+            new RoutingNodes(state, false), // NOTE(review): second argument is presumably a read-only flag -- confirm
+            state,
+            ClusterInfo.EMPTY,
+            SnapshotShardSizeInfo.EMPTY,
+            System.nanoTime()
+        );
+    }
+
     private RoutingAllocation onePrimaryOnNode1And1ReplicaRecovering(AllocationDeciders deciders) {
         return onePrimaryOnNode1And1ReplicaRecovering(deciders, new UnassignedInfo(UnassignedInfo.Reason.CLUSTER_RECOVERED, null));
     }