From f36e194381e128a73a70f8db5baf9712be3d8d9d Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Tue, 19 Dec 2023 19:18:54 +0800 Subject: [PATCH 1/3] remove orphan peer only when the peers is greater than the rule count Signed-off-by: bufferflies <1045931706@qq.com> --- pkg/schedule/checker/rule_checker.go | 4 ++- pkg/schedule/checker/rule_checker_test.go | 34 +++++++++++++++++++++++ pkg/schedule/placement/fit.go | 9 ++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/pkg/schedule/checker/rule_checker.go b/pkg/schedule/checker/rule_checker.go index c8dac448564..322dad11dbd 100644 --- a/pkg/schedule/checker/rule_checker.go +++ b/pkg/schedule/checker/rule_checker.go @@ -556,6 +556,7 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg } } + extra := fit.ExtraCount() // If hasUnhealthyFit is true, try to remove unhealthy orphan peers only if number of OrphanPeers is >= 2. // Ref https://github.com/tikv/pd/issues/4045 if len(fit.OrphanPeers) >= 2 { @@ -572,7 +573,8 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg ruleCheckerRemoveOrphanPeerCounter.Inc() return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } - if hasHealthPeer { + // If there is one extra peer but all the count of all the rule is not satisfied, we should not remove any peer to keep the high available. + if hasHealthPeer && extra > 0 { // there already exists a healthy orphan peer, so we can remove other orphan Peers. ruleCheckerRemoveOrphanPeerCounter.Inc() // if there exists a disconnected orphan peer, we will pick it to remove firstly. 
diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go index d75841bcf83..7cd14af6cde 100644 --- a/pkg/schedule/checker/rule_checker_test.go +++ b/pkg/schedule/checker/rule_checker_test.go @@ -2025,3 +2025,37 @@ func (suite *ruleCheckerTestAdvancedSuite) TestReplaceAnExistingPeerCases() { suite.ruleManager.DeleteGroupBundle(groupName, false) } } + +func (suite *ruleCheckerTestSuite) TestRemoveOrphanPeer() { + suite.cluster.AddLabelsStore(1, 1, map[string]string{"zone": "z1", "host": "h1"}) + suite.cluster.AddLabelsStore(2, 1, map[string]string{"zone": "z1", "host": "h1"}) + suite.cluster.AddLabelsStore(3, 1, map[string]string{"zone": "z1", "host": "h1"}) + suite.cluster.AddLabelsStore(4, 1, map[string]string{"zone": "z2", "host": "h1"}) + suite.cluster.AddLabelsStore(5, 1, map[string]string{"zone": "z2", "host": "h2"}) + suite.cluster.AddLabelsStore(6, 1, map[string]string{"zone": "z2", "host": "h2"}) + rule := &placement.Rule{ + GroupID: "pd", + ID: "test2", + Role: placement.Voter, + Count: 3, + LabelConstraints: []placement.LabelConstraint{ + { + Key: "zone", + Op: placement.In, + Values: []string{"z2"}, + }, + }, + } + suite.ruleManager.SetRule(rule) + suite.ruleManager.DeleteRule("pd", "default") + + suite.cluster.AddLeaderRegionWithRange(1, "200", "300", 1, 2, 3) + op := suite.rc.Check(suite.cluster.GetRegion(1)) + suite.NotNil(op) + suite.Equal("add-rule-peer", op.Desc()) + + suite.cluster.AddLeaderRegionWithRange(2, "300", "400", 1, 2, 3, 4) + op = suite.rc.Check(suite.cluster.GetRegion(2)) + suite.NotNil(op) + suite.Equal("remove-orphan-peer", op.Desc()) +} diff --git a/pkg/schedule/placement/fit.go b/pkg/schedule/placement/fit.go index 45afc5bcfa3..8f9870ce658 100644 --- a/pkg/schedule/placement/fit.go +++ b/pkg/schedule/placement/fit.go @@ -93,6 +93,15 @@ func (f *RegionFit) IsSatisfied() bool { return len(f.OrphanPeers) == 0 } +// retrun true if the count of the rule is satisfied. 
+func (f *RegionFit) ExtraCount() int {
+	desired := 0
+	for _, r := range f.RuleFits {
+		desired += r.Rule.Count
+	}
+	return len(f.regionStores) - desired
+}
+
 // GetRuleFit returns the RuleFit that contains the peer.
 func (f *RegionFit) GetRuleFit(peerID uint64) *RuleFit {
 	for _, rf := range f.RuleFits {

From dcd04fd78f8d7a779235712fe64b1a2d7df37c80 Mon Sep 17 00:00:00 2001
From: bufferflies <1045931706@qq.com>
Date: Wed, 20 Dec 2023 10:35:03 +0800
Subject: [PATCH 2/3] fix lint

Signed-off-by: bufferflies <1045931706@qq.com>
---
 pkg/schedule/checker/rule_checker.go | 2 +-
 pkg/schedule/placement/fit.go        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/schedule/checker/rule_checker.go b/pkg/schedule/checker/rule_checker.go
index 322dad11dbd..4391e20a775 100644
--- a/pkg/schedule/checker/rule_checker.go
+++ b/pkg/schedule/checker/rule_checker.go
@@ -573,7 +573,7 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg
 				ruleCheckerRemoveOrphanPeerCounter.Inc()
 				return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId)
 			}
-			// If there is one extra peer but all the count of all the rule is not satisfied, we should not remove any peer to keep the high available.
+			// The healthy orphan peer should be the last one to be removed only if there are extra peers to keep the high availablility.
 			if hasHealthPeer && extra > 0 {
 				// there already exists a healthy orphan peer, so we can remove other orphan Peers.
 				ruleCheckerRemoveOrphanPeerCounter.Inc()
diff --git a/pkg/schedule/placement/fit.go b/pkg/schedule/placement/fit.go
index 8f9870ce658..d907bcd011a 100644
--- a/pkg/schedule/placement/fit.go
+++ b/pkg/schedule/placement/fit.go
@@ -93,7 +93,7 @@ func (f *RegionFit) IsSatisfied() bool {
 	return len(f.OrphanPeers) == 0
 }
 
-// retrun true if the count of the rule is satisfied.
+// ExtraCount returns the extra count.
func (f *RegionFit) ExtraCount() int { desired := 0 for _, r := range f.RuleFits { From 15f919a917ae24649c508c830d854ba151ff0b77 Mon Sep 17 00:00:00 2001 From: bufferflies <1045931706@qq.com> Date: Wed, 20 Dec 2023 11:41:25 +0800 Subject: [PATCH 3/3] add more readable comment Signed-off-by: bufferflies <1045931706@qq.com> --- pkg/schedule/checker/rule_checker.go | 2 +- pkg/schedule/checker/rule_checker_test.go | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/schedule/checker/rule_checker.go b/pkg/schedule/checker/rule_checker.go index 4391e20a775..c7e27092346 100644 --- a/pkg/schedule/checker/rule_checker.go +++ b/pkg/schedule/checker/rule_checker.go @@ -573,7 +573,7 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg ruleCheckerRemoveOrphanPeerCounter.Inc() return operator.CreateRemovePeerOperator("remove-unhealthy-orphan-peer", c.cluster, 0, region, orphanPeer.StoreId) } - // The healthy orphan peer should be the last one to be removed only if there are extra peers to keep the high availablility. + // The healthy orphan peer can be removed to keep the high availability only if the peer count is greater than the rule requirement. if hasHealthPeer && extra > 0 { // there already exists a healthy orphan peer, so we can remove other orphan Peers. 
 				ruleCheckerRemoveOrphanPeerCounter.Inc()
diff --git a/pkg/schedule/checker/rule_checker_test.go b/pkg/schedule/checker/rule_checker_test.go
index 7cd14af6cde..d4d37de2c3c 100644
--- a/pkg/schedule/checker/rule_checker_test.go
+++ b/pkg/schedule/checker/rule_checker_test.go
@@ -2049,11 +2049,13 @@ func (suite *ruleCheckerTestSuite) TestRemoveOrphanPeer() {
 	suite.ruleManager.SetRule(rule)
 	suite.ruleManager.DeleteRule("pd", "default")
 
+	// case1: regionA has 3 peers but no extra peer can be removed, so it needs to add a peer first
 	suite.cluster.AddLeaderRegionWithRange(1, "200", "300", 1, 2, 3)
 	op := suite.rc.Check(suite.cluster.GetRegion(1))
 	suite.NotNil(op)
 	suite.Equal("add-rule-peer", op.Desc())
 
+	// case2: regionB has 4 peers and one extra peer can be removed, so it needs to remove the extra peer first
 	suite.cluster.AddLeaderRegionWithRange(2, "300", "400", 1, 2, 3, 4)
 	op = suite.rc.Check(suite.cluster.GetRegion(2))
 	suite.NotNil(op)