add maxFailoverCount limit to TiKV (#965)
weekface authored and tennix committed Sep 30, 2019
1 parent 65d7d5a commit f91fc58
Showing 9 changed files with 139 additions and 7 deletions.
1 change: 1 addition & 0 deletions charts/tidb-cluster/templates/tidb-cluster.yaml
@@ -81,6 +81,7 @@ spec:
     {{- if .Values.tikv.priorityClassName }}
     priorityClassName: {{ .Values.tikv.priorityClassName }}
     {{- end }}
+    maxFailoverCount: {{ .Values.tikv.maxFailoverCount | default 3 }}
   tidb:
     replicas: {{ .Values.tidb.replicas }}
     image: {{ .Values.tidb.image }}
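
The `| default 3` pipeline keeps the template backward compatible: a values.yaml written before this change, which has no `tikv.maxFailoverCount` key, still renders a limit of 3. A sketch of the rendered TidbCluster fragment under that assumption (illustrative output, not part of the commit):

spec:
  tikv:
    # filled in by `| default 3` because the value is unset
    maxFailoverCount: 3
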
4 changes: 4 additions & 0 deletions charts/tidb-cluster/values.yaml
@@ -261,6 +261,10 @@ tikv:
   # Specify the priorityClassName for TiKV Pod.
   # refer to https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/#how-to-use-priority-and-preemption
   priorityClassName: ""
+  # When a TiKV node fails, its status turns to `Disconnected`. After 30 minutes, it turns to `Down`.
+  # If the node is still down after another 5 minutes, TiDB Operator creates a new TiKV node.
+  # maxFailoverCount sets the maximum number of TiKV nodes that TiDB Operator can create during failover.
+  maxFailoverCount: 3
 
 tidb:
   # Please refer to https://github.com/pingcap/tidb/blob/master/config/config.toml.example for the default
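
Per the comment above, two timers are involved: a failed store stays `Disconnected` for 30 minutes before it turns `Down`, and the operator then waits its own failover period (5 minutes here) before creating a replacement. A self-contained Go sketch of that second check, using hypothetical names (`shouldFailover`, `failoverPeriod`); the real logic in pkg/manager/member/tikv_failover.go works from the store's LastTransitionTime and the tikvFailoverPeriod passed to NewTiKVFailover:

package main

import (
	"fmt"
	"time"
)

// shouldFailover reports whether a Down store has stayed down longer than
// the operator's failover period. Hypothetical helper, not the real API.
func shouldFailover(downSince time.Time, failoverPeriod time.Duration, now time.Time) bool {
	deadline := downSince.Add(failoverPeriod)
	return now.After(deadline)
}

func main() {
	downSince := time.Now().Add(-6 * time.Minute) // store marked Down 6 minutes ago
	fmt.Println(shouldFailover(downSince, 5*time.Minute, time.Now())) // true
}
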
4 changes: 2 additions & 2 deletions go.mod
@@ -121,8 +121,8 @@ require (
 	k8s.io/apiserver v0.0.0-20190118115647-a748535592ba
 	k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968
 	k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible
-	k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23
-	k8s.io/klog v0.4.0
+	k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0
+	k8s.io/klog v1.0.0
 	k8s.io/kubernetes v1.12.5
 	k8s.io/metrics v0.0.0-20190118124808-33c1aed8dc65 // indirect
 	k8s.io/utils v0.0.0-20190308190857-21c4ce38f2a7 // indirect

8 changes: 4 additions & 4 deletions go.sum
@@ -377,14 +377,14 @@ k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968 h1:VXLj8aMvJEo14Utv+knJDs0
 k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968/go.mod h1:qWnH3/b8sp/l7EvlDh7ulDU3UWA4P4N1NFbEEP791tM=
 k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible h1:Qw/ADzXV2yX+39UUCwNcZmdNS4+sR+V2Jf9NBdZWlQg=
 k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible/go.mod h1:7vJpHMYJwNQCWgzmNV+VYUl1zCObLyodBc8nIyt8L5s=
-k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23 h1:2oyDSO/D/4/bch5ZhL+sF5CPxO0GMrXhsIKFFOV6/uo=
-k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23/go.mod h1:V5BD6M4CyaN5m+VthcclXWsVcT1Hu+glwa1bi3MIsyE=
+k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0 h1:rhwEVFHoBm42V0b7yN9SUdbWzfCVndLzRV8YGIi0uWY=
+k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0/go.mod h1:4MfOrxyyZxxCuenwsdaJRtoSnOP5T13jE2LRYPZ6KeY=
 k8s.io/gengo v0.0.0-20190128074634-0689ccc1d7d6/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0=
 k8s.io/gengo v0.0.0-20190822140433-26a664648505 h1:ZY6yclUKVbZ+SdWnkfY+Je5vrMpKOxmGeKRbsXVmqYM=
 k8s.io/gengo v0.0.0-20190822140433-26a664648505/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0=
 k8s.io/klog v0.0.0-20181102134211-b9b56d5dfc92/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk=
-k8s.io/klog v0.4.0 h1:lCJCxf/LIowc2IGS9TPjWDyXY4nOmdGdfcwwDQCOURQ=
-k8s.io/klog v0.4.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
+k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8=
+k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
 k8s.io/kube-openapi v0.0.0-20190816220812-743ec37842bf h1:EYm5AW/UUDbnmnI+gK0TJDVK9qPLhM+sRHYanNKw0EQ=
 k8s.io/kube-openapi v0.0.0-20190816220812-743ec37842bf/go.mod h1:1TqjTSzOxsLGIKfj0lK8EeCP7K1iUG65v09OM0/WG5E=
 k8s.io/kubernetes v1.12.5 h1:pdQvCJZPGRNVS3CaajKuoPCZKreQaglbRcXwkDwR598=

1 change: 1 addition & 0 deletions images/tidb-operator-e2e/tidb-cluster-values.yaml
@@ -112,6 +112,7 @@ tikv:
   #   value: tidb
   #   effect: "NoSchedule"
   annotations: {}
+  maxFailoverCount: 3
 
 tikvPromGateway:
   image: prom/pushgateway:v0.3.1

1 change: 1 addition & 0 deletions pkg/apis/pingcap.com/v1alpha1/types.go
@@ -138,6 +138,7 @@ type TiKVSpec struct {
 	Replicas         int32  `json:"replicas"`
 	Privileged       bool   `json:"privileged,omitempty"`
 	StorageClassName string `json:"storageClassName,omitempty"`
+	MaxFailoverCount int32  `json:"maxFailoverCount,omitempty"`
 }
 
 // TiKVPromGatewaySpec runs as a sidecar with TiKVSpec
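
Because the new field is an int32 tagged `omitempty`, a zero MaxFailoverCount is dropped when the spec is serialized; the chart template above supplies the default of 3 at render time instead. A small sketch with a stand-in struct (not the real TiKVSpec) showing that marshaling behavior:

package main

import (
	"encoding/json"
	"fmt"
)

// spec stands in for the relevant slice of TiKVSpec.
type spec struct {
	Replicas         int32 `json:"replicas"`
	MaxFailoverCount int32 `json:"maxFailoverCount,omitempty"`
}

func main() {
	b, _ := json.Marshal(spec{Replicas: 3}) // MaxFailoverCount left at zero
	fmt.Println(string(b))                  // {"replicas":3}; omitempty drops it
	b, _ = json.Marshal(spec{Replicas: 3, MaxFailoverCount: 3})
	fmt.Println(string(b)) // {"replicas":3,"maxFailoverCount":3}
}
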
2 changes: 1 addition & 1 deletion pkg/manager/member/tidb_failover.go
@@ -46,7 +46,7 @@ func (tf *tidbFailover) Failover(tc *v1alpha1.TidbCluster) error {
 	}
 
 	if len(tc.Status.TiDB.FailureMembers) >= int(tc.Spec.TiDB.MaxFailoverCount) {
-		glog.Errorf("the failure members count reached the limit:%d", tc.Spec.TiDB.MaxFailoverCount)
+		glog.Warningf("the failure members count reached the limit:%d", tc.Spec.TiDB.MaxFailoverCount)
 		return nil
 	}
 	for _, tidbMember := range tc.Status.TiDB.Members {

9 changes: 9 additions & 0 deletions pkg/manager/member/tikv_failover.go
@@ -16,6 +16,7 @@ package member
 import (
 	"time"
 
+	"github.com/golang/glog"
 	"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
@@ -30,6 +31,9 @@ func NewTiKVFailover(tikvFailoverPeriod time.Duration) Failover {
 }
 
 func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error {
+	ns := tc.GetNamespace()
+	tcName := tc.GetName()
+
 	for storeID, store := range tc.Status.TiKV.Stores {
 		podName := store.PodName
 		if store.LastTransitionTime.IsZero() {
@@ -47,6 +51,11 @@ func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error {
 		if tc.Status.TiKV.FailureStores == nil {
 			tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{}
 		}
+		if len(tc.Status.TiKV.FailureStores) >= int(tc.Spec.TiKV.MaxFailoverCount) {
+			glog.Warningf("%s/%s failure stores count reached the limit: %d", ns, tcName, tc.Spec.TiKV.MaxFailoverCount)
+			return nil
+		}
+
 		tc.Status.TiKV.FailureStores[storeID] = v1alpha1.TiKVFailureStore{
 			PodName: podName,
 			StoreID: store.ID,
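
The guard added here caps failover growth: once tc.Status.TiKV.FailureStores already holds MaxFailoverCount entries, the operator logs a warning and returns instead of recording another failure store. A simplified, self-contained sketch of that pattern (store and recordFailures are stand-ins, not the operator's types):

package main

import "fmt"

type store struct {
	podName string
	down    bool
}

// recordFailures mimics the guard above: stop recording new failures
// once the failure map has reached the configured limit.
func recordFailures(stores map[string]store, failures map[string]string, max int) {
	for id, s := range stores {
		if !s.down {
			continue
		}
		if _, exist := failures[id]; exist {
			continue
		}
		if len(failures) >= max {
			fmt.Printf("failure stores count reached the limit: %d\n", max)
			return
		}
		failures[id] = s.podName // record the failed store
	}
}

func main() {
	stores := map[string]store{
		"3":  {podName: "tikv-3", down: true},
		"10": {podName: "tikv-10", down: true},
	}
	failures := map[string]string{"1": "tikv-1", "2": "tikv-2"}
	recordFailures(stores, failures, 3) // records at most one more store
	fmt.Println(len(failures))          // 3
}
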
116 changes: 116 additions & 0 deletions pkg/manager/member/tikv_failover_test.go
@@ -34,6 +34,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
 	testFn := func(test *testcase, t *testing.T) {
 		t.Log(test.name)
 		tc := newTidbClusterForPD()
+		tc.Spec.TiKV.MaxFailoverCount = 3
 		test.update(tc)
 		tikvFailover := newFakeTiKVFailover()
 
@@ -138,6 +139,121 @@ func TestTiKVFailoverFailover(t *testing.T) {
 				g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(1))
 			},
 		},
+		{
+			name: "not exceed max failover count",
+			update: func(tc *v1alpha1.TidbCluster) {
+				tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{
+					"3": {
+						State:              v1alpha1.TiKVStateDown,
+						PodName:            "tikv-3",
+						LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
+					},
+					"10": {
+						State:              v1alpha1.TiKVStateUp,
+						PodName:            "tikv-10",
+						LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
+					},
+					"11": {
+						State:              v1alpha1.TiKVStateUp,
+						PodName:            "tikv-11",
+						LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)},
+					},
+				}
+				tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{
+					"1": {
+						PodName: "tikv-1",
+						StoreID: "1",
+					},
+					"2": {
+						PodName: "tikv-2",
+						StoreID: "2",
+					},
+				}
+			},
+			err: false,
+			expectFn: func(tc *v1alpha1.TidbCluster) {
+				g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
+				g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3))
+			},
+		},
+		{
+			name: "exceed max failover count1",
+			update: func(tc *v1alpha1.TidbCluster) {
+				tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{
+					"3": {
+						State:              v1alpha1.TiKVStateDown,
+						PodName:            "tikv-3",
+						LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
+					},
+					"10": {
+						State:              v1alpha1.TiKVStateDown,
+						PodName:            "tikv-10",
+						LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
+					},
+					"11": {
+						State:              v1alpha1.TiKVStateUp,
+						PodName:            "tikv-11",
+						LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)},
+					},
+				}
+				tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{
+					"1": {
+						PodName: "tikv-1",
+						StoreID: "1",
+					},
+					"2": {
+						PodName: "tikv-2",
+						StoreID: "2",
+					},
+				}
+			},
+			err: false,
+			expectFn: func(tc *v1alpha1.TidbCluster) {
+				g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
+				g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3))
+			},
+		},
+		{
+			name: "exceed max failover count2",
+			update: func(tc *v1alpha1.TidbCluster) {
+				tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{
+					"12": {
+						State:              v1alpha1.TiKVStateDown,
+						PodName:            "tikv-12",
+						LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
+					},
+					"13": {
+						State:              v1alpha1.TiKVStateDown,
+						PodName:            "tikv-13",
+						LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)},
+					},
+					"14": {
+						State:              v1alpha1.TiKVStateDown,
+						PodName:            "tikv-14",
+						LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
+					},
+				}
+				tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{
+					"1": {
+						PodName: "tikv-1",
+						StoreID: "1",
+					},
+					"2": {
+						PodName: "tikv-2",
+						StoreID: "2",
+					},
+					"3": {
+						PodName: "tikv-3",
+						StoreID: "3",
+					},
+				}
+			},
+			err: false,
+			expectFn: func(tc *v1alpha1.TidbCluster) {
+				g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
+				g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3))
+			},
+		},
 	}
 	for i := range tests {
 		testFn(&tests[i], t)
