diff --git a/pkg/schedule/checker/rule_checker.go b/pkg/schedule/checker/rule_checker.go
index c8dac448564..c7e27092346 100644
--- a/pkg/schedule/checker/rule_checker.go
+++ b/pkg/schedule/checker/rule_checker.go
@@ -556,6 +556,7 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg
 		}
 	}
 
+	extra := fit.ExtraCount()
 	// If hasUnhealthyFit is true, try to remove unhealthy orphan peers only if number of OrphanPeers is >= 2.
 	// Ref https://github.com/tikv/pd/issues/4045
 	if len(fit.OrphanPeers) >= 2 {
@@ -572,7 +573,8 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg
 			ruleCheckerRemoveOrphanPeerCounter.Inc()
 			return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId)
 		}
-		if hasHealthPeer {
+		// A healthy orphan peer can be removed without hurting high availability only when the peer count exceeds the rule requirement.
+		if hasHealthPeer && extra > 0 {
 			// there already exists a healthy orphan peer, so we can remove other orphan Peers.
 			ruleCheckerRemoveOrphanPeerCounter.Inc()
 			// if there exists a disconnected orphan peer, we will pick it to remove firstly.
diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go
index d75841bcf83..d4d37de2c3c 100644
--- a/pkg/schedule/checker/rule_checker_test.go
+++ b/pkg/schedule/checker/rule_checker_test.go
@@ -2025,3 +2025,39 @@ func (suite *ruleCheckerTestAdvancedSuite) TestReplaceAnExistingPeerCases() {
 		suite.ruleManager.DeleteGroupBundle(groupName, false)
 	}
 }
+
+func (suite *ruleCheckerTestSuite) TestRemoveOrphanPeer() {
+	suite.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1", "host": "h1"})
+	suite.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1", "host": "h1"})
+	suite.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z1", "host": "h1"})
+	suite.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z2", "host": "h1"})
+	suite.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z2", "host": "h2"})
+	suite.cluster.AddLabelsStore(6, 1, map[string]string{"zone": "z2", "host": "h2"})
+	rule := &placement.Rule{
+		GroupID: "pd",
+		ID:      "test2",
+		Role:    placement.Voter,
+		Count:   3,
+		LabelConstraints: []placement.LabelConstraint{
+			{
+				Key:    "zone",
+				Op:     placement.In,
+				Values: []string{"z2"},
+			},
+		},
+	}
+	suite.ruleManager.SetRule(rule)
+	suite.ruleManager.DeleteRule("pd", "default")
+
+	// case 1: regionA has 3 peers but no extra peer can be removed, so it needs to add a peer first
+	suite.cluster.AddLeaderRegionWithRange(1, "200", "300", 1, 2, 3)
+	op := suite.rc.Check(suite.cluster.GetRegion(1))
+	suite.NotNil(op)
+	suite.Equal("add-rule-peer", op.Desc())
+
+	// case 2: regionB has 4 peers and one extra peer can be removed, so it needs to remove the extra peer first
+	suite.cluster.AddLeaderRegionWithRange(2, "300", "400", 1, 2, 3, 4)
+	op = suite.rc.Check(suite.cluster.GetRegion(2))
+	suite.NotNil(op)
+	suite.Equal("remove-orphan-peer", op.Desc())
+}
diff --git a/pkg/schedule/placement/fit.go b/pkg/schedule/placement/fit.go
index 45afc5bcfa3..d907bcd011a 100644
--- a/pkg/schedule/placement/fit.go
+++ b/pkg/schedule/placement/fit.go
@@ -93,6 +93,15 @@ func (f *RegionFit) IsSatisfied() bool {
 	return len(f.OrphanPeers) == 0
 }
 
+// ExtraCount returns the number of peers the region holds beyond the total count required by the rules.
+func (f *RegionFit) ExtraCount() int {
+	desired := 0
+	for _, r := range f.RuleFits {
+		desired += r.Rule.Count
+	}
+	return len(f.regionStores) - desired
+}
+
 // GetRuleFit returns the RuleFit that contains the peer.
 func (f *RegionFit) GetRuleFit(peerID uint64) *RuleFit {
 	for _, rf := range f.RuleFits {
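
To make the new check concrete: ExtraCount reports how many peers a region holds beyond the total demanded by its matched rules, and fixOrphanPeers now removes a healthy orphan peer only when that surplus is positive. What follows is a minimal standalone sketch of that arithmetic, not the real PD implementation; Rule, RuleFit, and RegionFit here are simplified stand-ins that keep only the fields the computation touches.

    package main

    import "fmt"

    // Simplified stand-ins (assumption: only the fields used by
    // ExtraCount are modeled; the real types live in pkg/schedule/placement).
    type Rule struct{ Count int }

    type RuleFit struct{ Rule *Rule }

    type RegionFit struct {
    	RuleFits     []*RuleFit
    	regionStores []uint64 // one entry per store holding a peer of the region
    }

    // ExtraCount mirrors the method added in fit.go: peers present
    // minus peers demanded by all matched rules.
    func (f *RegionFit) ExtraCount() int {
    	desired := 0
    	for _, r := range f.RuleFits {
    		desired += r.Rule.Count
    	}
    	return len(f.regionStores) - desired
    }

    func main() {
    	// One rule demanding 3 voters, peers on stores 1-3: no surplus,
    	// so a healthy orphan peer must not be removed (test case 1).
    	fit := &RegionFit{
    		RuleFits:     []*RuleFit{{Rule: &Rule{Count: 3}}},
    		regionStores: []uint64{1, 2, 3},
    	}
    	fmt.Println(fit.ExtraCount()) // 0

    	// Same rule with peers on 4 stores: one extra peer, so an
    	// orphan peer is safe to remove (test case 2).
    	fit.regionStores = append(fit.regionStores, 4)
    	fmt.Println(fit.ExtraCount()) // 1
    }

This matches the test expectations above: with extra == 0 the checker prefers add-rule-peer, and only with extra > 0 does it emit remove-orphan-peer.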