From 449260d2f956faca514f87488e3e5371e699fad0 Mon Sep 17 00:00:00 2001 From: weekface Date: Sat, 28 Sep 2019 09:14:51 +0800 Subject: [PATCH 1/5] add maxFailoverCount limit to TiKV --- .../tidb-cluster/templates/tidb-cluster.yaml | 1 + charts/tidb-cluster/values.yaml | 1 + go.mod | 4 +- go.sum | 8 +- .../tidb-cluster-values.yaml | 1 + pkg/apis/pingcap.com/v1alpha1/types.go | 1 + pkg/manager/member/tikv_failover.go | 8 ++ pkg/manager/member/tikv_failover_test.go | 79 +++++++++++++++++++ 8 files changed, 97 insertions(+), 6 deletions(-) diff --git a/charts/tidb-cluster/templates/tidb-cluster.yaml b/charts/tidb-cluster/templates/tidb-cluster.yaml index 8dab22f3ee..b8cb679761 100644 --- a/charts/tidb-cluster/templates/tidb-cluster.yaml +++ b/charts/tidb-cluster/templates/tidb-cluster.yaml @@ -81,6 +81,7 @@ spec: {{- if .Values.tikv.priorityClassName }} priorityClassName: {{ .Values.tikv.priorityClassName }} {{- end }} + maxFailoverCount: {{ .Values.tikv.maxFailoverCount | default 3 }} tidb: replicas: {{ .Values.tidb.replicas }} image: {{ .Values.tidb.image }} diff --git a/charts/tidb-cluster/values.yaml b/charts/tidb-cluster/values.yaml index 556a59a0db..7561f05422 100644 --- a/charts/tidb-cluster/values.yaml +++ b/charts/tidb-cluster/values.yaml @@ -261,6 +261,7 @@ tikv: # Specify the priorityClassName for TiKV Pod. # refer to https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/#how-to-use-priority-and-preemption priorityClassName: "" + maxFailoverCount: 3 tidb: # Please refer to https://github.com/pingcap/tidb/blob/master/config/config.toml.example for the default diff --git a/go.mod b/go.mod index 7d9174ec2d..79d65b4f7b 100644 --- a/go.mod +++ b/go.mod @@ -121,8 +121,8 @@ require ( k8s.io/apiserver v0.0.0-20190118115647-a748535592ba k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968 k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible - k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23 - k8s.io/klog v0.4.0 + k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0 + k8s.io/klog v1.0.0 k8s.io/kubernetes v1.12.5 k8s.io/metrics v0.0.0-20190118124808-33c1aed8dc65 // indirect k8s.io/utils v0.0.0-20190308190857-21c4ce38f2a7 // indirect diff --git a/go.sum b/go.sum index 65959bcb0c..fc2b6052ae 100644 --- a/go.sum +++ b/go.sum @@ -377,14 +377,14 @@ k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968 h1:VXLj8aMvJEo14Utv+knJDs0 k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968/go.mod h1:qWnH3/b8sp/l7EvlDh7ulDU3UWA4P4N1NFbEEP791tM= k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible h1:Qw/ADzXV2yX+39UUCwNcZmdNS4+sR+V2Jf9NBdZWlQg= k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible/go.mod h1:7vJpHMYJwNQCWgzmNV+VYUl1zCObLyodBc8nIyt8L5s= -k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23 h1:2oyDSO/D/4/bch5ZhL+sF5CPxO0GMrXhsIKFFOV6/uo= -k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23/go.mod h1:V5BD6M4CyaN5m+VthcclXWsVcT1Hu+glwa1bi3MIsyE= +k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0 h1:rhwEVFHoBm42V0b7yN9SUdbWzfCVndLzRV8YGIi0uWY= +k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0/go.mod h1:4MfOrxyyZxxCuenwsdaJRtoSnOP5T13jE2LRYPZ6KeY= k8s.io/gengo v0.0.0-20190128074634-0689ccc1d7d6/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= k8s.io/gengo v0.0.0-20190822140433-26a664648505 h1:ZY6yclUKVbZ+SdWnkfY+Je5vrMpKOxmGeKRbsXVmqYM= k8s.io/gengo v0.0.0-20190822140433-26a664648505/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0= k8s.io/klog 
v0.0.0-20181102134211-b9b56d5dfc92/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk= -k8s.io/klog v0.4.0 h1:lCJCxf/LIowc2IGS9TPjWDyXY4nOmdGdfcwwDQCOURQ= -k8s.io/klog v0.4.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= +k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= +k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= k8s.io/kube-openapi v0.0.0-20190816220812-743ec37842bf h1:EYm5AW/UUDbnmnI+gK0TJDVK9qPLhM+sRHYanNKw0EQ= k8s.io/kube-openapi v0.0.0-20190816220812-743ec37842bf/go.mod h1:1TqjTSzOxsLGIKfj0lK8EeCP7K1iUG65v09OM0/WG5E= k8s.io/kubernetes v1.12.5 h1:pdQvCJZPGRNVS3CaajKuoPCZKreQaglbRcXwkDwR598= diff --git a/images/tidb-operator-e2e/tidb-cluster-values.yaml b/images/tidb-operator-e2e/tidb-cluster-values.yaml index 993afcd209..807f3a91cf 100644 --- a/images/tidb-operator-e2e/tidb-cluster-values.yaml +++ b/images/tidb-operator-e2e/tidb-cluster-values.yaml @@ -112,6 +112,7 @@ tikv: # value: tidb # effect: "NoSchedule" annotations: {} + maxFailoverCount: 3 tikvPromGateway: image: prom/pushgateway:v0.3.1 diff --git a/pkg/apis/pingcap.com/v1alpha1/types.go b/pkg/apis/pingcap.com/v1alpha1/types.go index 8adafec617..ce429faf09 100644 --- a/pkg/apis/pingcap.com/v1alpha1/types.go +++ b/pkg/apis/pingcap.com/v1alpha1/types.go @@ -138,6 +138,7 @@ type TiKVSpec struct { Replicas int32 `json:"replicas"` Privileged bool `json:"privileged,omitempty"` StorageClassName string `json:"storageClassName,omitempty"` + MaxFailoverCount int32 `json:"maxFailoverCount,omitempty"` } // TiKVPromGatewaySpec runs as a sidecar with TiKVSpec diff --git a/pkg/manager/member/tikv_failover.go b/pkg/manager/member/tikv_failover.go index 9481d991f3..d00b019b2e 100644 --- a/pkg/manager/member/tikv_failover.go +++ b/pkg/manager/member/tikv_failover.go @@ -16,6 +16,7 @@ package member import ( "time" + "github.com/golang/glog" "github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -30,6 +31,13 @@ func NewTiKVFailover(tikvFailoverPeriod time.Duration) Failover { } func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error { + ns := tc.GetNamespace() + tcName := tc.GetName() + if len(tc.Status.TiKV.FailureStores) >= int(tc.Spec.TiKV.MaxFailoverCount) { + glog.Errorf("%s/%s failure stores count reached the limit: %d", ns, tcName, tc.Spec.TiKV.MaxFailoverCount) + return nil + } + for storeID, store := range tc.Status.TiKV.Stores { podName := store.PodName if store.LastTransitionTime.IsZero() { diff --git a/pkg/manager/member/tikv_failover_test.go b/pkg/manager/member/tikv_failover_test.go index 2c82de803e..0834b764ac 100644 --- a/pkg/manager/member/tikv_failover_test.go +++ b/pkg/manager/member/tikv_failover_test.go @@ -34,6 +34,7 @@ func TestTiKVFailoverFailover(t *testing.T) { testFn := func(test *testcase, t *testing.T) { t.Log(test.name) tc := newTidbClusterForPD() + tc.Spec.TiKV.MaxFailoverCount = 3 test.update(tc) tikvFailover := newFakeTiKVFailover() @@ -138,6 +139,84 @@ func TestTiKVFailoverFailover(t *testing.T) { g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(1)) }, }, + { + name: "not exceed max failover count", + update: func(tc *v1alpha1.TidbCluster) { + tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{ + "3": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-3", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + "10": { + State: v1alpha1.TiKVStateUp, + PodName: "tikv-10", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, 
+ "11": { + State: v1alpha1.TiKVStateUp, + PodName: "tikv-11", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)}, + }, + } + tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{ + "1": { + PodName: "tikv-1", + StoreID: "1", + }, + "2": { + PodName: "tikv-2", + StoreID: "2", + }, + } + }, + err: false, + expectFn: func(tc *v1alpha1.TidbCluster) { + g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) + g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3)) + }, + }, + { + name: "exceed max failover count", + update: func(tc *v1alpha1.TidbCluster) { + tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{ + "12": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-12", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + "13": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-13", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)}, + }, + "14": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-14", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + } + tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{ + "1": { + PodName: "tikv-1", + StoreID: "1", + }, + "2": { + PodName: "tikv-2", + StoreID: "2", + }, + "3": { + PodName: "tikv-3", + StoreID: "3", + }, + } + }, + err: false, + expectFn: func(tc *v1alpha1.TidbCluster) { + g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) + g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3)) + }, + }, } for i := range tests { testFn(&tests[i], t) From d986559725ead7b66879061935911f32753b4adc Mon Sep 17 00:00:00 2001 From: weekface Date: Sun, 29 Sep 2019 10:13:15 +0800 Subject: [PATCH 2/5] address comment --- pkg/manager/member/tikv_failover.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/manager/member/tikv_failover.go b/pkg/manager/member/tikv_failover.go index d00b019b2e..5f515ac546 100644 --- a/pkg/manager/member/tikv_failover.go +++ b/pkg/manager/member/tikv_failover.go @@ -34,7 +34,7 @@ func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error { ns := tc.GetNamespace() tcName := tc.GetName() if len(tc.Status.TiKV.FailureStores) >= int(tc.Spec.TiKV.MaxFailoverCount) { - glog.Errorf("%s/%s failure stores count reached the limit: %d", ns, tcName, tc.Spec.TiKV.MaxFailoverCount) + glog.Warningf("%s/%s failure stores count reached the limit: %d", ns, tcName, tc.Spec.TiKV.MaxFailoverCount) return nil } From d28aec66ecb96739020517116fb0e1dc542bc925 Mon Sep 17 00:00:00 2001 From: weekface Date: Sun, 29 Sep 2019 11:50:30 +0800 Subject: [PATCH 3/5] address comment --- charts/tidb-cluster/values.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/charts/tidb-cluster/values.yaml b/charts/tidb-cluster/values.yaml index 7561f05422..ce1c2749b8 100644 --- a/charts/tidb-cluster/values.yaml +++ b/charts/tidb-cluster/values.yaml @@ -261,6 +261,9 @@ tikv: # Specify the priorityClassName for TiKV Pod. # refer to https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/#how-to-use-priority-and-preemption priorityClassName: "" + # When a TiKV node fails, its status turns to `Disconnected`. After 30 minutes, it turns to `Down`. + # After waiting for 5 minutes, TiDB Operator creates a new TiKV node if this TiKV node is still down. + # maxFailoverCount is used to configure the maximum number of TiKV nodes that TiDB Operator can create when failover occurs. 
maxFailoverCount: 3 tidb: From 84c3b977d1eb676203f86c54afdfd5ff4a406694 Mon Sep 17 00:00:00 2001 From: weekface Date: Sun, 29 Sep 2019 13:04:37 +0800 Subject: [PATCH 4/5] address comment --- pkg/manager/member/tikv_failover.go | 9 +++--- pkg/manager/member/tikv_failover_test.go | 39 +++++++++++++++++++++++- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/pkg/manager/member/tikv_failover.go b/pkg/manager/member/tikv_failover.go index 5f515ac546..b150b204fc 100644 --- a/pkg/manager/member/tikv_failover.go +++ b/pkg/manager/member/tikv_failover.go @@ -33,10 +33,6 @@ func NewTiKVFailover(tikvFailoverPeriod time.Duration) Failover { func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error { ns := tc.GetNamespace() tcName := tc.GetName() - if len(tc.Status.TiKV.FailureStores) >= int(tc.Spec.TiKV.MaxFailoverCount) { - glog.Warningf("%s/%s failure stores count reached the limit: %d", ns, tcName, tc.Spec.TiKV.MaxFailoverCount) - return nil - } for storeID, store := range tc.Status.TiKV.Stores { podName := store.PodName @@ -55,6 +51,11 @@ func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error { if tc.Status.TiKV.FailureStores == nil { tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{} } + if len(tc.Status.TiKV.FailureStores) >= int(tc.Spec.TiKV.MaxFailoverCount) { + glog.Warningf("%s/%s failure stores count reached the limit: %d", ns, tcName, tc.Spec.TiKV.MaxFailoverCount) + return nil + } + tc.Status.TiKV.FailureStores[storeID] = v1alpha1.TiKVFailureStore{ PodName: podName, StoreID: store.ID, diff --git a/pkg/manager/member/tikv_failover_test.go b/pkg/manager/member/tikv_failover_test.go index 0834b764ac..c1c6f3377b 100644 --- a/pkg/manager/member/tikv_failover_test.go +++ b/pkg/manager/member/tikv_failover_test.go @@ -177,7 +177,44 @@ func TestTiKVFailoverFailover(t *testing.T) { }, }, { - name: "exceed max failover count", + name: "exceed max failover count1", + update: func(tc *v1alpha1.TidbCluster) { + tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{ + "3": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-3", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + "10": { + State: v1alpha1.TiKVStateDown, + PodName: "tikv-10", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)}, + }, + "11": { + State: v1alpha1.TiKVStateUp, + PodName: "tikv-11", + LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)}, + }, + } + tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{ + "1": { + PodName: "tikv-1", + StoreID: "1", + }, + "2": { + PodName: "tikv-2", + StoreID: "2", + }, + } + }, + err: false, + expectFn: func(tc *v1alpha1.TidbCluster) { + g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3)) + g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3)) + }, + }, + { + name: "exceed max failover count2", update: func(tc *v1alpha1.TidbCluster) { tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{ "12": { From 08fd07cd21383d286e5beffdd14e32ec60dd3f1c Mon Sep 17 00:00:00 2001 From: weekface Date: Sun, 29 Sep 2019 14:11:06 +0800 Subject: [PATCH 5/5] address comment --- pkg/manager/member/tidb_failover.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/manager/member/tidb_failover.go b/pkg/manager/member/tidb_failover.go index 4fd8e5cefd..0bb10264e3 100644 --- a/pkg/manager/member/tidb_failover.go +++ b/pkg/manager/member/tidb_failover.go @@ -46,7 +46,7 @@ func (tf *tidbFailover) Failover(tc *v1alpha1.TidbCluster) error { } if 
len(tc.Status.TiDB.FailureMembers) >= int(tc.Spec.TiDB.MaxFailoverCount) { - glog.Errorf("the failure members count reached the limit:%d", tc.Spec.TiDB.MaxFailoverCount) + glog.Warningf("the failure members count reached the limit:%d", tc.Spec.TiDB.MaxFailoverCount) return nil } for _, tidbMember := range tc.Status.TiDB.Members {
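
Editor's note (not part of the patch series): the sketch below condenses the control flow these commits converge on — PATCH 1 adds a maxFailoverCount guard at the top of tikvFailover.Failover, PATCH 4 moves it inside the per-store loop, next to where a failure store is recorded. The types here (TidbCluster, TiKVStore, TiKVFailureStore) are simplified stand-ins, not the real pingcap.com/v1alpha1 API; the "Down longer than the failover period" condition is a plausible reconstruction of logic the diff leaves unchanged and does not show; fmt.Printf stands in for glog.Warningf. Treat it as an illustrative sketch under those assumptions, not the operator's actual implementation.

    // Illustrative sketch only. Simplified stand-in types, not the real
    // pingcap.com/v1alpha1 API. Mirrors the PATCH 4 ordering: the limit
    // check sits inside the loop, right before a failure store is added.
    package main

    import (
        "fmt"
        "time"
    )

    type TiKVStore struct {
        ID                 string
        PodName            string
        State              string // "Up" or "Down"
        LastTransitionTime time.Time
    }

    type TiKVFailureStore struct {
        PodName string
        StoreID string
    }

    type TidbCluster struct {
        MaxFailoverCount int32
        Stores           map[string]TiKVStore
        FailureStores    map[string]TiKVFailureStore
    }

    // failover records a store as failed once it has been Down longer than
    // failoverPeriod, unless the number of failure stores already reached
    // MaxFailoverCount, in which case it warns and returns early.
    func failover(tc *TidbCluster, failoverPeriod time.Duration) error {
        for storeID, store := range tc.Stores {
            if store.LastTransitionTime.IsZero() || store.State != "Down" {
                continue
            }
            // Reconstructed condition: the diff does not show the real check.
            if !store.LastTransitionTime.Add(failoverPeriod).Before(time.Now()) {
                continue // not Down long enough yet
            }
            if tc.FailureStores == nil {
                tc.FailureStores = map[string]TiKVFailureStore{}
            }
            if len(tc.FailureStores) >= int(tc.MaxFailoverCount) {
                // Same intent as the glog.Warningf in the patch: stop
                // recording new failure stores once the limit is reached.
                fmt.Printf("failure stores count reached the limit: %d\n", tc.MaxFailoverCount)
                return nil
            }
            tc.FailureStores[storeID] = TiKVFailureStore{
                PodName: store.PodName,
                StoreID: store.ID,
            }
        }
        return nil
    }

    func main() {
        // Mirrors the "not exceed max failover count" test case: two existing
        // failure stores plus one store Down for 70 minutes -> three total.
        tc := &TidbCluster{
            MaxFailoverCount: 3,
            Stores: map[string]TiKVStore{
                "3": {ID: "3", PodName: "tikv-3", State: "Down",
                    LastTransitionTime: time.Now().Add(-70 * time.Minute)},
            },
            FailureStores: map[string]TiKVFailureStore{
                "1": {PodName: "tikv-1", StoreID: "1"},
                "2": {PodName: "tikv-2", StoreID: "2"},
            },
        }
        _ = failover(tc, time.Hour)
        fmt.Println("failure stores:", len(tc.FailureStores)) // 3: at the limit
    }

Design note: moving the guard inside the loop (PATCH 4) keeps the nil-map initialization and the limit check next to the one place a failure store is added, so the warning fires only when a store would actually have been recorded, and the per-case tests added alongside it exercise both sides of the limit.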