add maxFailoverCount limit to TiKV #965

Merged 6 commits on Sep 30, 2019
Changes from 1 commit
1 change: 1 addition & 0 deletions charts/tidb-cluster/templates/tidb-cluster.yaml
@@ -81,6 +81,7 @@ spec:
{{- if .Values.tikv.priorityClassName }}
priorityClassName: {{ .Values.tikv.priorityClassName }}
{{- end }}
maxFailoverCount: {{ .Values.tikv.maxFailoverCount | default 3 }}
tidb:
replicas: {{ .Values.tidb.replicas }}
image: {{ .Values.tidb.image }}
1 change: 1 addition & 0 deletions charts/tidb-cluster/values.yaml
@@ -261,6 +261,7 @@ tikv:
# Specify the priorityClassName for TiKV Pod.
# refer to https://kubernetes.io/docs/concepts/configuration/pod-priority-preemption/#how-to-use-priority-and-preemption
priorityClassName: ""
maxFailoverCount: 3
Contributor:
Suggest adding a comment here to make the detailed meaning clear, and maybe how to configure the value, e.g. for a cluster with 100 TiKV stores, what's the suggested config?

Contributor (author):

Done.


tidb:
# Please refer to https://github.com/pingcap/tidb/blob/master/config/config.toml.example for the default
4 changes: 2 additions & 2 deletions go.mod
@@ -121,8 +121,8 @@ require (
k8s.io/apiserver v0.0.0-20190118115647-a748535592ba
k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968
k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible
k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23
k8s.io/klog v0.4.0
k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0
k8s.io/klog v1.0.0
k8s.io/kubernetes v1.12.5
k8s.io/metrics v0.0.0-20190118124808-33c1aed8dc65 // indirect
k8s.io/utils v0.0.0-20190308190857-21c4ce38f2a7 // indirect
8 changes: 4 additions & 4 deletions go.sum
@@ -377,14 +377,14 @@ k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968 h1:VXLj8aMvJEo14Utv+knJDs0
k8s.io/cli-runtime v0.0.0-20190118125240-caee4253d968/go.mod h1:qWnH3/b8sp/l7EvlDh7ulDU3UWA4P4N1NFbEEP791tM=
k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible h1:Qw/ADzXV2yX+39UUCwNcZmdNS4+sR+V2Jf9NBdZWlQg=
k8s.io/client-go v2.0.0-alpha.0.0.20190115164855-701b91367003+incompatible/go.mod h1:7vJpHMYJwNQCWgzmNV+VYUl1zCObLyodBc8nIyt8L5s=
k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23 h1:2oyDSO/D/4/bch5ZhL+sF5CPxO0GMrXhsIKFFOV6/uo=
k8s.io/code-generator v0.0.0-20190912042602-ebc0eb3a5c23/go.mod h1:V5BD6M4CyaN5m+VthcclXWsVcT1Hu+glwa1bi3MIsyE=
k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0 h1:rhwEVFHoBm42V0b7yN9SUdbWzfCVndLzRV8YGIi0uWY=
k8s.io/code-generator v0.0.0-20190927075303-016f2b3d74d0/go.mod h1:4MfOrxyyZxxCuenwsdaJRtoSnOP5T13jE2LRYPZ6KeY=
k8s.io/gengo v0.0.0-20190128074634-0689ccc1d7d6/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0=
k8s.io/gengo v0.0.0-20190822140433-26a664648505 h1:ZY6yclUKVbZ+SdWnkfY+Je5vrMpKOxmGeKRbsXVmqYM=
k8s.io/gengo v0.0.0-20190822140433-26a664648505/go.mod h1:ezvh/TsK7cY6rbqRK0oQQ8IAqLxYwwyPxAX1Pzy0ii0=
k8s.io/klog v0.0.0-20181102134211-b9b56d5dfc92/go.mod h1:Gq+BEi5rUBO/HRz0bTSXDUcqjScdoY3a9IHpCEIOOfk=
k8s.io/klog v0.4.0 h1:lCJCxf/LIowc2IGS9TPjWDyXY4nOmdGdfcwwDQCOURQ=
k8s.io/klog v0.4.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8=
k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I=
k8s.io/kube-openapi v0.0.0-20190816220812-743ec37842bf h1:EYm5AW/UUDbnmnI+gK0TJDVK9qPLhM+sRHYanNKw0EQ=
k8s.io/kube-openapi v0.0.0-20190816220812-743ec37842bf/go.mod h1:1TqjTSzOxsLGIKfj0lK8EeCP7K1iUG65v09OM0/WG5E=
k8s.io/kubernetes v1.12.5 h1:pdQvCJZPGRNVS3CaajKuoPCZKreQaglbRcXwkDwR598=
1 change: 1 addition & 0 deletions images/tidb-operator-e2e/tidb-cluster-values.yaml
@@ -112,6 +112,7 @@ tikv:
# value: tidb
# effect: "NoSchedule"
annotations: {}
maxFailoverCount: 3

tikvPromGateway:
image: prom/pushgateway:v0.3.1
1 change: 1 addition & 0 deletions pkg/apis/pingcap.com/v1alpha1/types.go
@@ -138,6 +138,7 @@ type TiKVSpec struct {
Replicas int32 `json:"replicas"`
Privileged bool `json:"privileged,omitempty"`
StorageClassName string `json:"storageClassName,omitempty"`
MaxFailoverCount int32 `json:"maxFailoverCount,omitempty"`
}

// TiKVPromGatewaySpec runs as a sidecar with TiKVSpec
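Since the new field is tagged `omitempty` and the chart applies `| default 3`, a TidbCluster created without the chart can carry a zero MaxFailoverCount, which under the new check would stop failover immediately (`len(FailureStores) >= 0` always holds). A defensive default on the Go side could look like this sketch (hypothetical; the helper and its fallback constant are not part of this PR):

```go
// maxFailoverCountOrDefault is a hypothetical guard: an unset (zero)
// MaxFailoverCount would otherwise make the new limit check trip at once,
// since len(FailureStores) >= 0 always holds.
func maxFailoverCountOrDefault(spec *v1alpha1.TiKVSpec) int32 {
	if spec.MaxFailoverCount <= 0 {
		return 3 // mirrors the chart's `| default 3`
	}
	return spec.MaxFailoverCount
}
```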
8 changes: 8 additions & 0 deletions pkg/manager/member/tikv_failover.go
@@ -16,6 +16,7 @@ package member
import (
"time"

"github.com/golang/glog"
"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
@@ -30,6 +31,13 @@ func NewTiKVFailover(tikvFailoverPeriod time.Duration) Failover {
}

func (tf *tikvFailover) Failover(tc *v1alpha1.TidbCluster) error {
ns := tc.GetNamespace()
tcName := tc.GetName()
if len(tc.Status.TiKV.FailureStores) >= int(tc.Spec.TiKV.MaxFailoverCount) {
Contributor:
IIUC, if multiple tikv-servers become Down simultaneously, the MaxFailoverCount may be exceeded.

Contributor (author):

Good catch!

Addressed. PTAL again.
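A plausible shape of that follow-up (a sketch only; the actual fix is in a later commit not shown in this view, and the helper name is invented) is to re-check the cap immediately before each insertion rather than once per sync:

```go
// recordFailureStore is a hypothetical sketch of the fix: re-check the cap
// right before every insertion, so several stores going Down in the same
// sync cannot push FailureStores past MaxFailoverCount.
func recordFailureStore(tc *v1alpha1.TidbCluster, storeID string, store v1alpha1.TiKVStore) bool {
	if len(tc.Status.TiKV.FailureStores) >= int(tc.Spec.TiKV.MaxFailoverCount) {
		glog.Warningf("%s/%s failure stores count reached the limit: %d",
			tc.GetNamespace(), tc.GetName(), tc.Spec.TiKV.MaxFailoverCount)
		return false
	}
	if tc.Status.TiKV.FailureStores == nil {
		tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{}
	}
	tc.Status.TiKV.FailureStores[storeID] = v1alpha1.TiKVFailureStore{
		PodName: store.PodName,
		StoreID: storeID,
	}
	return true
}
```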

glog.Errorf("%s/%s failure stores count reached the limit: %d", ns, tcName, tc.Spec.TiKV.MaxFailoverCount)
return nil
Contributor:
The return value is nil, so would it be more appropriate to log at warning level?
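That would amount to a one-line change, e.g.:

```go
// Suggested severity change (sketch): glog.Warningf instead of glog.Errorf,
// since the function returns nil rather than an error here.
glog.Warningf("%s/%s failure stores count reached the limit: %d", ns, tcName, tc.Spec.TiKV.MaxFailoverCount)
```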

Contributor:
IIRC, tidb_failover.go also needs to be modified.
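An analogous guard for TiDB might look like the following (hypothetical sketch; the TiDB spec/status field names are assumed by symmetry with the TiKV types and are not part of this commit):

```go
// Hypothetical mirror of the TiKV cap for tidb_failover.go; assumes the
// TiDB spec and status carry MaxFailoverCount and FailureMembers fields.
if len(tc.Status.TiDB.FailureMembers) >= int(tc.Spec.TiDB.MaxFailoverCount) {
	glog.Warningf("%s/%s failure members count reached the limit: %d",
		ns, tcName, tc.Spec.TiDB.MaxFailoverCount)
	return nil
}
```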

}

for storeID, store := range tc.Status.TiKV.Stores {
podName := store.PodName
if store.LastTransitionTime.IsZero() {
79 changes: 79 additions & 0 deletions pkg/manager/member/tikv_failover_test.go
@@ -34,6 +34,7 @@ func TestTiKVFailoverFailover(t *testing.T) {
testFn := func(test *testcase, t *testing.T) {
t.Log(test.name)
tc := newTidbClusterForPD()
tc.Spec.TiKV.MaxFailoverCount = 3
test.update(tc)
tikvFailover := newFakeTiKVFailover()

@@ -138,6 +139,84 @@
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(1))
},
},
{
name: "not exceed max failover count",
update: func(tc *v1alpha1.TidbCluster) {
tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{
"3": {
State: v1alpha1.TiKVStateDown,
PodName: "tikv-3",
LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
},
"10": {
State: v1alpha1.TiKVStateUp,
PodName: "tikv-10",
LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
},
"11": {
State: v1alpha1.TiKVStateUp,
PodName: "tikv-11",
LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)},
},
}
tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{
"1": {
PodName: "tikv-1",
StoreID: "1",
},
"2": {
PodName: "tikv-2",
StoreID: "2",
},
}
},
err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3))
},
},
{
name: "exceed max failover count",
update: func(tc *v1alpha1.TidbCluster) {
tc.Status.TiKV.Stores = map[string]v1alpha1.TiKVStore{
"12": {
State: v1alpha1.TiKVStateDown,
PodName: "tikv-12",
LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
},
"13": {
State: v1alpha1.TiKVStateDown,
PodName: "tikv-13",
LastTransitionTime: metav1.Time{Time: time.Now().Add(-61 * time.Minute)},
},
"14": {
State: v1alpha1.TiKVStateDown,
PodName: "tikv-14",
LastTransitionTime: metav1.Time{Time: time.Now().Add(-70 * time.Minute)},
},
}
tc.Status.TiKV.FailureStores = map[string]v1alpha1.TiKVFailureStore{
"1": {
PodName: "tikv-1",
StoreID: "1",
},
"2": {
PodName: "tikv-2",
StoreID: "2",
},
"3": {
PodName: "tikv-3",
StoreID: "3",
},
}
},
err: false,
expectFn: func(tc *v1alpha1.TidbCluster) {
g.Expect(int(tc.Spec.TiKV.Replicas)).To(Equal(3))
g.Expect(len(tc.Status.TiKV.FailureStores)).To(Equal(3))
},
},
}
for i := range tests {
testFn(&tests[i], t)