From f91fc5833566104fd24e273ce1b4f08786ab5bdd Mon Sep 17 00:00:00 2001 From: weekface Date: Mon, 30 Sep 2019 14:57:25 +0800 Subject: [PATCH] add maxFailoverCount limit to TiKV (#965) --- .../tidb-cluster/templates/tidb-cluster.yaml | 1 + charts/tidb-cluster/values.yaml | 4 + go.mod | 4 +- go.sum | 8 +- .../tidb-cluster-values.yaml | 1 + pkg/apis/pingcap.com/v1alpha1/types.go | 1 + pkg/manager/member/tidb_failover.go | 2 +- pkg/manager/member/tikv_failover.go | 9 ++ pkg/manager/member/tikv_failover_test.go | 116 ++++++++++++++++++ 9 files changed, 139 insertions(+), 7 deletions(-) diff --git a/charts/tidb-cluster/templates/tidb-cluster.yaml b/charts/tidb-cluster/templates/tidb-cluster.yaml index 8dab22f3ee..b8cb679761 100644 --- a/charts/tidb-cluster/templates/tidb-cluster.yaml +++ b/charts/tidb-cluster/templates/tidb-cluster.yaml @@ -81,6 +81,7 @@ spec: {{- if .Values.tikv.priorityClassName }} priorityClassName: {{ .Values.tikv.priorityClassName }} {{- end }} + maxFailoverCount: {{ .Values.tikv.maxFailoverCount | default 3 }} tidb: replicas: {{ .Values.tidb.replicas }} image: {{ .Values.tidb.image }} diff --git a/charts/tidb-cluster/values.yaml b/charts/tidb-cluster/values.yaml index bd6e6f9bb7..b1c98b2431 100644 --- a/charts/tidb-cluster/values.yaml +++ b/charts/tidb-cluster/values.yaml @@ -261,6 +261,10 @@ tikv: # Specify the priorityClassName for TiKV Pod. # refer to https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/#how-to-use-priority-and-preemption priorityClassName: "" + # When a TiKV node fails, its status turns to `Disconnected`. After 30 minutes, it turns to `Down`. + # After waiting for 5 minutes, TiDB Operator creates a new TiKV node if this TiKV node is still down. + # maxFailoverCount is used to configure the maximum number of TiKV nodes that TiDB Operator can create when failover occurs. 
+ maxFailoverCount: 3 tidb: # Please refer to https://github.com/pingcap/tidb/blob/master/config/config.toml.example for the default diff --git a/go.mod b/go.mod index 7d9174ec2d..79d65b4f7b 100644 --- a/go.mod +++ b/go.mod @@ -121,8 +121,8 @@ require ( k8s.io/apiserver v0.0.0-20190118115647-a748535592ba k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968 k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible - k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23 - k8s.io/klog v0.4.0 + k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0 + k8s.io/klog v1.0.0 k8s.io/kubernetes v1.12.5 k8s.io/metrics v0.0.0-20190118124808-33c1aed8dc65 // indirect k8s.io/utils v0.0.0-20190308190857-21c4ce38f2a7 // indirect diff --git a/go.sum b/go.sum index 65959bcb0c..fc2b6052ae 100644 --- a/go.sum +++ b/go.sum @@ -377,14 +377,14 @@ k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968 h1:VXLj8aMvJEo14Utv+knJDs0 k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968/go.mod h1:qWnH3/b8sp/l7EvlDh7ulDU3UWA4P4N1NFbEEP791tM= k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible h1:Qw/ADzXV2yX+39UUCwNcZmdNS4+sR+V2Jf9NBdZWlQg= k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible/go.mod h1:7vJpHMYJwNQCWgzmNV+VYUl1zCObLyodBc8nIyt8L5s= -k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23 h1:2oyDSO/D/4/bch5ZhL+sF5CPxO0GMrXhsIKFFOV6/uo= -k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23/go.mod h1:V5BD6M4CyaN5m+VthcclXWsVcT1Hu+glwa1bi3MIsyE= +k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0 h1:rhwEVFHoBm42V0b7yN9SUdbWzfCVndLzRV8YGIi0uWY= +k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0/go.mod h1:4MfOrxyyZxxCuenwsdaJRtoSnOP5T13jE2LRYPZ6KeY= k8s.io/gengo v0.0.0-20190128074634-0689ccc1d7d6/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= k8s.io/gengo v0.0.0-20190822140433-26a664648505 h1:ZY6yclUKVbZ+SdWnkfY+Je5vrMpKOxmGeKRbsXVmqYM= k8s.io/gengo v0.0.0-20190822140433-26a664648505/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= k8s.io/klog v0.0.0-20181102134211-b9b56d5dfc92/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk= -k8s.io/klog v0.4.0 h1:lCJCxf/LIowc2IGS9TPjWDyXY4nOmdGdfcwwDQCOURQ= -k8s.io/klog v0.4.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= +k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= +k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= k8s.io/kube-openapi v0.0.0-20190816220812-743ec37842bf h1:EYm5AW/UUDbnmnI+gK0TJDVK9qPLhM+sRHYanNKw0EQ= k8s.io/kube-openapi v0.0.0-20190816220812-743ec37842bf/go.mod h1:1TqjTSzOxsLGIKfj0lK8EeCP7K1iUG65v09OM0/WG5E= k8s.io/kubernetes v1.12.5 h1:pdQvCJZPGRNVS3CaajKuoPCZKreQaglbRcXwkDwR598= diff --git a/images/tidb-operator-e2e/tidb-cluster-values.yaml b/images/tidb-operator-e2e/tidb-cluster-values.yaml index 993afcd209..807f3a91cf 100644 --- a/images/tidb-operator-e2e/tidb-cluster-values.yaml +++ b/images/tidb-operator-e2e/tidb-cluster-values.yaml @@ -112,6 +112,7 @@ tikv: # value: tidb # effect: "NoSchedule" annotations: {} + maxFailoverCount: 3 tikvPromGateway: image: prom/pushgateway:v0.3.1 diff --git a/pkg/apis/pingcap.com/v1alpha1/types.go b/pkg/apis/pingcap.com/v1alpha1/types.go index 8adafec617..ce429faf09 100644 --- a/pkg/apis/pingcap.com/v1alpha1/types.go +++ b/pkg/apis/pingcap.com/v1alpha1/types.go @@ -138,6 +138,7 @@ type TiKVSpec struct { Replicas int32 `json:"replicas"` Privileged bool `json:"privileged,omitempty"` StorageClassName string `json:"storageClassName,omitempty"` + MaxFailoverCount int32 
`json:"maxFailoverCount,omitempty"` } // TiKVPromGatewaySpec runs as a sidecar with TiKVSpec diff --git a/pkg/manager/member/tidb_failover.go b/pkg/manager/member/tidb_failover.go index 4fd8e5cefd..0bb10264e3 100644 --- a/pkg/manager/member/tidb_failover.go +++ b/pkg/manager/member/tidb_failover.go @@ -46,7 +46,7 @@ func (tf *tidbFailover) Failover(tc *v1alpha1.TidbCluster) error { } if len(tc.Status.TiDB.FailureMembers) >= int(tc.Spec.TiDB.MaxFailoverCount) { - glog.Errorf("the failure members count reached the limit:%d", tc.Spec.TiDB.MaxFailoverCount) + glog.Warningf("the failure members count reached the limit:%d", tc.Spec.TiDB.MaxFailoverCount) return nil } for _, tidbMember := range tc.Status.TiDB.Members { diff --git a/pkg/manager/member/tikv_failover.go b/pkg/manager/member/tikv_failover.go index 9481d991f3..b150b204fc 100644 --- a/pkg/manager/member/tikv_failover.go +++ b/pkg/manager/member/tikv_failover.go @@ -16,6 +16,7 @@ package member import ( "time" + "github.com/golang/glog" "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -30,6 +31,9 @@ func NewTiKVFailover(tikvFailoverPeriod time.Duration) Failover { } func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error { + ns := tc.GetNamespace() + tcName := tc.GetName() + for storeID, store := range tc.Status.TiKV.Stores { podName := store.PodName if store.LastTransitionTime.IsZero() { @@ -47,6 +51,11 @@ func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error { if tc.Status.TiKV.FailureStores == nil { tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{} } + if len(tc.Status.TiKV.FailureStores) >= int(tc.Spec.TiKV.MaxFailoverCount) { + glog.Warningf("%s/%s failure stores count reached the limit: %d", ns, tcName, tc.Spec.TiKV.MaxFailoverCount) + return nil + } + tc.Status.TiKV.FailureStores[storeID] = v1alpha1.TiKVFailureStore{ PodName: podName, StoreID: store.ID, diff --git a/pkg/manager/member/tikv_failover_test.go b/pkg/manager/member/tikv_failover_test.go index 2c82de803e..c1c6f3377b 100644 --- a/pkg/manager/member/tikv_failover_test.go +++ b/pkg/manager/member/tikv_failover_test.go @@ -34,6 +34,7 @@ func TestTiKVFailoverFailover(t *testing.T) { testFn := func(test *testcase, t *testing.T) { t.Log(test.name) tc := newTidbClusterForPD() + tc.Spec.TiKV.MaxFailoverCount = 3 test.update(tc) tikvFailover := newFakeTiKVFailover() @@ -138,6 +139,121 @@ func TestTiKVFailoverFailover(t *testing.T) { g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(1)) }, }, + { + name: "not exceed max failover count", + update: func(tc *v1alpha1.TidbCluster) { + tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{ + "3": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-3", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + "10": { + State: v1alpha1.TiKVStateUp, + PodName: "tikv-10", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + "11": { + State: v1alpha1.TiKVStateUp, + PodName: "tikv-11", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)}, + }, + } + tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{ + "1": { + PodName: "tikv-1", + StoreID: "1", + }, + "2": { + PodName: "tikv-2", + StoreID: "2", + }, + } + }, + err: false, + expectFn: func(tc *v1alpha1.TidbCluster) { + g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) + g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3)) + }, + }, + { + name: "exceed max failover count1", + update: 
func(tc *v1alpha1.TidbCluster) { + tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{ + "3": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-3", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + "10": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-10", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + "11": { + State: v1alpha1.TiKVStateUp, + PodName: "tikv-11", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)}, + }, + } + tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{ + "1": { + PodName: "tikv-1", + StoreID: "1", + }, + "2": { + PodName: "tikv-2", + StoreID: "2", + }, + } + }, + err: false, + expectFn: func(tc *v1alpha1.TidbCluster) { + g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) + g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3)) + }, + }, + { + name: "exceed max failover count2", + update: func(tc *v1alpha1.TidbCluster) { + tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{ + "12": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-12", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + "13": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-13", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)}, + }, + "14": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-14", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + } + tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{ + "1": { + PodName: "tikv-1", + StoreID: "1", + }, + "2": { + PodName: "tikv-2", + StoreID: "2", + }, + "3": { + PodName: "tikv-3", + StoreID: "3", + }, + } + }, + err: false, + expectFn: func(tc *v1alpha1.TidbCluster) { + g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) + g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3)) + }, + }, } for i := range tests { testFn(&tests[i], t)
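
The guard added in tikv_failover.go returns nil once len(tc.Status.TiKV.FailureStores) reaches tc.Spec.TiKV.MaxFailoverCount, so failover stops quietly at the limit and only a warning is logged; with the chart template's `default 3`, at most three replacement TiKV Pods are created per cluster unless tikv.maxFailoverCount is overridden in values.yaml. Below is a minimal standalone sketch of that counting behavior, using simplified stand-in types rather than the operator's actual v1alpha1 structs (the real Failover also inspects each store's state and failover deadline before recording a failure store):

package main

import "fmt"

// Simplified stand-ins for the operator's v1alpha1 types; the real
// TidbCluster spec and status carry many more fields.
type TiKVSpec struct {
	MaxFailoverCount int32
}

type TiKVFailureStore struct {
	PodName string
	StoreID string
}

type TidbCluster struct {
	Spec          struct{ TiKV TiKVSpec }
	FailureStores map[string]TiKVFailureStore
}

// recordFailure mirrors the added guard: once the number of failure
// stores reaches MaxFailoverCount, no further stores are recorded and
// the caller only emits a warning.
func recordFailure(tc *TidbCluster, storeID, podName string) bool {
	if len(tc.FailureStores) >= int(tc.Spec.TiKV.MaxFailoverCount) {
		fmt.Printf("failure stores count reached the limit: %d\n",
			tc.Spec.TiKV.MaxFailoverCount)
		return false
	}
	tc.FailureStores[storeID] = TiKVFailureStore{PodName: podName, StoreID: storeID}
	return true
}

func main() {
	tc := &TidbCluster{FailureStores: map[string]TiKVFailureStore{}}
	tc.Spec.TiKV.MaxFailoverCount = 3
	for i := 1; i <= 5; i++ {
		id := fmt.Sprintf("%d", i)
		recordFailure(tc, id, "tikv-"+id) // stores 4 and 5 hit the limit
	}
	fmt.Println(len(tc.FailureStores)) // 3, matching the test expectations above
}

Note that the cap applies to the total size of the FailureStores map, not to failures per reconciliation pass, which is why the "exceed max failover count" test cases above expect exactly 3 failure stores regardless of how many stores are simultaneously Down.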