From 742e75a4843c07e40218eb6ee2ba486761abe24d Mon Sep 17 00:00:00 2001 From: weekface Date: Wed, 17 Jul 2019 15:53:24 +0800 Subject: [PATCH 1/6] * stop kubelet * pv data safe --- tests/actions.go | 79 +++++++++++++++++++++++++++++++++++-- tests/cmd/stability/main.go | 16 +++++++- tests/failover.go | 30 ++++++++++++-- tests/fault.go | 77 +++++++++++++++++++++++++++++------- 4 files changed, 178 insertions(+), 24 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index bc8bc06820..01d6466ec1 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -25,6 +25,7 @@ import ( "os" "os/exec" "path/filepath" + "reflect" "strconv" "strings" "sync" @@ -159,7 +160,8 @@ type OperatorActions interface { CheckTidbClustersAvailable(infos []*TidbClusterConfig) error CheckOperatorDownOrDie(infos []*TidbClusterConfig) CheckTidbClustersAvailableOrDie(infos []*TidbClusterConfig) - CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) + CheckEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) + CheckKubeletDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) CheckKubeProxyDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig) CheckKubeSchedulerDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig) @@ -545,6 +547,43 @@ func (oa *operatorActions) DeployTidbClusterOrDie(info *TidbClusterConfig) { func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { glog.Infof("cleaning tidbcluster %s/%s", info.Namespace, info.ClusterName) oa.EmitEvent(info, "CleanTidbCluster") + ns := info.Namespace + tcName := info.ClusterName + + selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ + MatchLabels: map[string]string{ + label.InstanceLabelKey: tcName, + }, + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: label.ComponentLabelKey, + Operator: metav1.LabelSelectorOpIn, + Values: []string{label.PDLabelVal, label.TiKVLabelVal}, + }, + }, + }) + if err != nil { + return err + } + pvcList, err := oa.kubeCli.CoreV1().PersistentVolumeClaims(ns).List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return err + } + var beforePVCNames []string + for _, pvc := range pvcList.Items { + beforePVCNames = append(beforePVCNames, pvc.GetName()) + } + glog.V(4).Info(beforePVCNames) + + pvList, err := oa.kubeCli.CoreV1().PersistentVolumes().List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return err + } + var beforePVNames []string + for _, pv := range pvList.Items { + beforePVNames = append(beforePVNames, pv.GetName()) + } + glog.V(4).Info(beforePVNames) charts := []string{ info.ClusterName, @@ -560,7 +599,38 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { } } - err := oa.kubeCli.CoreV1().Pods(info.Namespace).Delete(getBackupDirPodName, &metav1.DeleteOptions{}) + time.Sleep(time.Minute) + + pvcList, err = oa.kubeCli.CoreV1().PersistentVolumeClaims(ns).List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return err + } + var afterPVCNames []string + for _, pvc := range pvcList.Items { + afterPVCNames = append(afterPVCNames, pvc.GetName()) + } + glog.V(4).Info(afterPVCNames) + + pvList, err = oa.kubeCli.CoreV1().PersistentVolumes().List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return err + } + var afterPVNames []string + for _, pv := range pvList.Items { + afterPVNames = append(afterPVNames, pv.GetName()) + } + glog.V(4).Info(afterPVNames) + + if !reflect.DeepEqual(beforePVCNames, afterPVCNames) { + return fmt.Errorf("pvc changed when we delete cluster: %s/%s, before: %v, after: %v", + ns, tcName, beforePVCNames, afterPVCNames) + } + if !reflect.DeepEqual(beforePVNames, afterPVNames) { + return fmt.Errorf("pv changed when we delete cluster: %s/%s, before: %v, after: %v", + ns, tcName, beforePVNames, afterPVNames) + } + + err = oa.kubeCli.CoreV1().Pods(info.Namespace).Delete(getBackupDirPodName, &metav1.DeleteOptions{}) if err != nil && !errors.IsNotFound(err) { return fmt.Errorf("failed to delete dir pod %v", err) } @@ -950,7 +1020,8 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo } glog.V(4).Infof("index:%d,schedulers:%v,error:%v", i, schedulers, err) if len(schedulers) > 1 { - return true, fmt.Errorf("there are too many evict leader schedulers: %v", schedulers) + glog.Errorf("there are too many evict leader schedulers: %v", schedulers) + return false, nil } if len(schedulers) == 0 { return false, nil @@ -1114,7 +1185,7 @@ func (oa *operatorActions) tikvMembersReadyFn(tc *v1alpha1.TidbCluster) (bool, e } if len(tc.Status.TiKV.Stores) != int(replicas) { glog.Infof("tidbcluster: %s/%s .status.TiKV.Stores.count(%d) != %d", - ns, tcName, len(tc.Status.TiKV.Stores), tc.Spec.TiKV.Replicas) + ns, tcName, len(tc.Status.TiKV.Stores), replicas) return false, nil } if tikvSet.Status.ReadyReplicas != tikvSet.Status.Replicas { diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index cc22181a32..90618aa670 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -231,14 +231,26 @@ func run() { // truncate tikv sst file oa.TruncateSSTFileThenCheckFailoverOrDie(clusters[0], 5*time.Minute) - // stop etcd + // stop one etcd faultEtcd := tests.SelectNode(cfg.ETCDs) fta.StopETCDOrDie(faultEtcd) defer fta.StartETCDOrDie(faultEtcd) time.Sleep(3 * time.Minute) - oa.CheckOneEtcdDownOrDie(ocfg, deployedClusters, faultEtcd) + oa.CheckEtcdDownOrDie(ocfg, deployedClusters, faultEtcd) fta.StartETCDOrDie(faultEtcd) + // stop all etcds + fta.StopETCDOrDie() + time.Sleep(10 * time.Minute) + fta.StartETCDOrDie() + oa.CheckEtcdDownOrDie(ocfg, deployedClusters, "") + + // stop all kubelets + fta.StopKubeletOrDie() + time.Sleep(10 * time.Minute) + fta.StartKubeletOrDie() + oa.CheckKubeletDownOrDie(ocfg, deployedClusters, "") + // stop all kube-proxy and k8s/operator/tidbcluster is available fta.StopKubeProxyOrDie() oa.CheckKubeProxyDownOrDie(ocfg, clusters) diff --git a/tests/failover.go b/tests/failover.go index 193f400f4b..83f20d07ad 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -173,7 +173,7 @@ func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, node st if _, exist := affectedPods[failureStore.PodName]; exist { err := fmt.Errorf("cluster: [%s] the tikv store[%s] should be mark failure after %s", info.FullName(), failureStore.PodName, deadline.Format(time.RFC3339)) glog.Errorf(err.Error()) - return false, err + return false, nil } } @@ -472,8 +472,32 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string) return nodeMap, nil } -func (oa *operatorActions) CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { - glog.Infof("check k8s/operator/tidbCluster status when one etcd down") +func (oa *operatorActions) CheckKubeletDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { + glog.Infof("check k8s/operator/tidbCluster status when kubelet down") + KeepOrDie(3*time.Second, 10*time.Minute, func() error { + err := oa.CheckK8sAvailable(nil, nil) + if err != nil { + return err + } + glog.V(4).Infof("k8s cluster is available.") + err = oa.CheckOperatorAvailable(operatorConfig) + if err != nil { + return err + } + glog.V(4).Infof("tidb operator is available.") + err = oa.CheckTidbClustersAvailable(clusters) + if err != nil { + return err + } + glog.V(4).Infof("all clusters are available") + return nil + }) +} + +func (oa *operatorActions) CheckEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { + glog.Infof("check k8s/operator/tidbCluster status when etcd down") + // kube-apiserver may block 15 min + time.Sleep(20 * time.Minute) KeepOrDie(3*time.Second, 10*time.Minute, func() error { err := oa.CheckK8sAvailable(nil, nil) if err != nil { diff --git a/tests/fault.go b/tests/fault.go index 23ea8b5d3f..a6ba6f37e0 100644 --- a/tests/fault.go +++ b/tests/fault.go @@ -4,6 +4,7 @@ import ( "fmt" "math/rand" "os" + "sync" "time" "github.com/pingcap/tidb-operator/tests/slack" @@ -37,8 +38,10 @@ type FaultTriggerActions interface { StopETCDOrDie(nodes ...string) StartETCD(nodes ...string) error StartETCDOrDie(nodes ...string) - StopKubelet(node string) error - StartKubelet(node string) error + StopKubelet(nodes ...string) error + StopKubeletOrDie(nodes ...string) + StartKubelet(nodes ...string) error + StartKubeletOrDie(nodes ...string) StopKubeAPIServer(node string) error StopKubeAPIServerOrDie(node string) StartKubeAPIServer(node string) error @@ -333,22 +336,22 @@ func (fa *faultTriggerActions) StopETCD(nodes ...string) error { } func (fa *faultTriggerActions) StopETCDOrDie(nodes ...string) { + glog.Infof("stopping %v etcds", nodes) if err := fa.StopETCD(nodes...); err != nil { slack.NotifyAndPanic(err) } } -// StartETCD starts the etcd service. -// If the `nodes` is empty, StartETCD will start all etcd service. -func (fa *faultTriggerActions) StartETCD(nodes ...string) error { +// StopKubelet stops the kubelet service. +func (fa *faultTriggerActions) StopKubelet(nodes ...string) error { if len(nodes) == 0 { - for _, ns := range fa.cfg.ETCDs { + for _, ns := range fa.cfg.Nodes { nodes = append(nodes, ns.Nodes...) } } for _, node := range nodes { - if err := fa.serviceAction(node, manager.ETCDService, startAction); err != nil { + if err := fa.serviceAction(node, manager.KubeletService, stopAction); err != nil { return err } } @@ -356,20 +359,64 @@ func (fa *faultTriggerActions) StartETCD(nodes ...string) error { return nil } -func (fa *faultTriggerActions) StartETCDOrDie(nodes ...string) { - if err := fa.StartETCD(nodes...); err != nil { +func (fa *faultTriggerActions) StopKubeletOrDie(nodes ...string) { + glog.Infof("stopping %v kubelets", nodes) + if err := fa.StopKubelet(nodes...); err != nil { slack.NotifyAndPanic(err) } } -// StopKubelet stops the kubelet service. -func (fa *faultTriggerActions) StopKubelet(node string) error { - return fa.serviceAction(node, manager.KubeletService, stopAction) +// StartKubelet starts the kubelet service. +func (fa *faultTriggerActions) StartKubelet(nodes ...string) error { + if len(nodes) == 0 { + for _, ns := range fa.cfg.Nodes { + nodes = append(nodes, ns.Nodes...) + } + } + + for _, node := range nodes { + if err := fa.serviceAction(node, manager.KubeletService, startAction); err != nil { + return err + } + } + + return nil } -// StartKubelet starts the kubelet service. -func (fa *faultTriggerActions) StartKubelet(node string) error { - return fa.serviceAction(node, manager.KubeletService, startAction) +func (fa *faultTriggerActions) StartKubeletOrDie(nodes ...string) { + if err := fa.StartKubelet(nodes...); err != nil { + slack.NotifyAndPanic(err) + } +} + +// StartETCD starts the etcd service. +// If the `nodes` is empty, StartETCD will start all etcd service. +func (fa *faultTriggerActions) StartETCD(nodes ...string) error { + if len(nodes) == 0 { + for _, ns := range fa.cfg.ETCDs { + nodes = append(nodes, ns.Nodes...) + } + } + + var wg sync.WaitGroup + for _, node := range nodes { + wg.Add(1) + go func(n string) { + defer wg.Done() + if err := fa.serviceAction(n, manager.ETCDService, startAction); err != nil { + slack.NotifyAndPanic(fmt.Errorf("failed to start %s etcd, %v", n, err)) + } + }(node) + } + wg.Wait() + + return nil +} + +func (fa *faultTriggerActions) StartETCDOrDie(nodes ...string) { + if err := fa.StartETCD(nodes...); err != nil { + slack.NotifyAndPanic(err) + } } // StopKubeScheduler stops the kube-scheduler service. From 2aff2eafb6ae17e6b425174fb45b6b6a7c331021 Mon Sep 17 00:00:00 2001 From: weekface Date: Thu, 18 Jul 2019 11:50:59 +0800 Subject: [PATCH 2/6] fix check operator --- tests/failover.go | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/failover.go b/tests/failover.go index 83f20d07ad..b0ed58c789 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -474,6 +474,7 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string) func (oa *operatorActions) CheckKubeletDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { glog.Infof("check k8s/operator/tidbCluster status when kubelet down") + time.Sleep(10 * time.Minute) KeepOrDie(3*time.Second, 10*time.Minute, func() error { err := oa.CheckK8sAvailable(nil, nil) if err != nil { @@ -711,14 +712,22 @@ func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]string, exc } func (oa *operatorActions) CheckOperatorAvailable(operatorConfig *OperatorConfig) error { - return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { + var errCount int + var e error + return wait.Poll(10*time.Second, 3*time.Minute, func() (bool, error) { + if errCount >= 10 { + return true, e + } controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbControllerName, metav1.GetOptions{}) if err != nil { glog.Errorf("failed to get deployment:%s failed,error:%v", tidbControllerName, err) return false, nil } if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas { - return false, fmt.Errorf("the %s is not available", tidbControllerName) + e = fmt.Errorf("the %s is not available", tidbControllerName) + glog.Error(e) + errCount++ + return false, nil } schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbSchedulerName, metav1.GetOptions{}) if err != nil { @@ -726,7 +735,10 @@ func (oa *operatorActions) CheckOperatorAvailable(operatorConfig *OperatorConfig return false, nil } if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas { - return false, fmt.Errorf("the %s is not available", tidbSchedulerName) + e = fmt.Errorf("the %s is not available", tidbSchedulerName) + glog.Error(e) + errCount++ + return false, nil } return true, nil }) From 9c986541df05a987744c6e84b9586b6bfe54214c Mon Sep 17 00:00:00 2001 From: weekface Date: Wed, 17 Jul 2019 15:53:24 +0800 Subject: [PATCH 3/6] * stop kubelet * pv data safe --- tests/actions.go | 79 +++++++++++++++++++++++++++++++++++-- tests/cmd/stability/main.go | 16 +++++++- tests/failover.go | 30 ++++++++++++-- tests/fault.go | 77 +++++++++++++++++++++++++++++------- 4 files changed, 178 insertions(+), 24 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index 881a403926..ceb83ffa94 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -25,6 +25,7 @@ import ( "os" "os/exec" "path/filepath" + "reflect" "strconv" "strings" "sync" @@ -159,7 +160,8 @@ type OperatorActions interface { CheckTidbClustersAvailable(infos []*TidbClusterConfig) error CheckOperatorDownOrDie(infos []*TidbClusterConfig) CheckTidbClustersAvailableOrDie(infos []*TidbClusterConfig) - CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) + CheckEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) + CheckKubeletDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) CheckOneApiserverDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) CheckKubeProxyDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig) CheckKubeSchedulerDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig) @@ -523,6 +525,43 @@ func (oa *operatorActions) DeployTidbClusterOrDie(info *TidbClusterConfig) { func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { glog.Infof("cleaning tidbcluster %s/%s", info.Namespace, info.ClusterName) oa.EmitEvent(info, "CleanTidbCluster") + ns := info.Namespace + tcName := info.ClusterName + + selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{ + MatchLabels: map[string]string{ + label.InstanceLabelKey: tcName, + }, + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: label.ComponentLabelKey, + Operator: metav1.LabelSelectorOpIn, + Values: []string{label.PDLabelVal, label.TiKVLabelVal}, + }, + }, + }) + if err != nil { + return err + } + pvcList, err := oa.kubeCli.CoreV1().PersistentVolumeClaims(ns).List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return err + } + var beforePVCNames []string + for _, pvc := range pvcList.Items { + beforePVCNames = append(beforePVCNames, pvc.GetName()) + } + glog.V(4).Info(beforePVCNames) + + pvList, err := oa.kubeCli.CoreV1().PersistentVolumes().List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return err + } + var beforePVNames []string + for _, pv := range pvList.Items { + beforePVNames = append(beforePVNames, pv.GetName()) + } + glog.V(4).Info(beforePVNames) charts := []string{ info.ClusterName, @@ -538,7 +577,38 @@ func (oa *operatorActions) CleanTidbCluster(info *TidbClusterConfig) error { } } - err := oa.kubeCli.CoreV1().Pods(info.Namespace).Delete(getBackupDirPodName, &metav1.DeleteOptions{}) + time.Sleep(time.Minute) + + pvcList, err = oa.kubeCli.CoreV1().PersistentVolumeClaims(ns).List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return err + } + var afterPVCNames []string + for _, pvc := range pvcList.Items { + afterPVCNames = append(afterPVCNames, pvc.GetName()) + } + glog.V(4).Info(afterPVCNames) + + pvList, err = oa.kubeCli.CoreV1().PersistentVolumes().List(metav1.ListOptions{LabelSelector: selector.String()}) + if err != nil { + return err + } + var afterPVNames []string + for _, pv := range pvList.Items { + afterPVNames = append(afterPVNames, pv.GetName()) + } + glog.V(4).Info(afterPVNames) + + if !reflect.DeepEqual(beforePVCNames, afterPVCNames) { + return fmt.Errorf("pvc changed when we delete cluster: %s/%s, before: %v, after: %v", + ns, tcName, beforePVCNames, afterPVCNames) + } + if !reflect.DeepEqual(beforePVNames, afterPVNames) { + return fmt.Errorf("pv changed when we delete cluster: %s/%s, before: %v, after: %v", + ns, tcName, beforePVNames, afterPVNames) + } + + err = oa.kubeCli.CoreV1().Pods(info.Namespace).Delete(getBackupDirPodName, &metav1.DeleteOptions{}) if err != nil && !errors.IsNotFound(err) { return fmt.Errorf("failed to delete dir pod %v", err) } @@ -934,7 +1004,8 @@ func (oa *operatorActions) CheckUpgrade(ctx context.Context, info *TidbClusterCo } glog.V(4).Infof("index:%d,schedulers:%v,error:%v", i, schedulers, err) if len(schedulers) > 1 { - return true, fmt.Errorf("there are too many evict leader schedulers: %v", schedulers) + glog.Errorf("there are too many evict leader schedulers: %v", schedulers) + return false, nil } if len(schedulers) == 0 { return false, nil @@ -1098,7 +1169,7 @@ func (oa *operatorActions) tikvMembersReadyFn(tc *v1alpha1.TidbCluster) (bool, e } if len(tc.Status.TiKV.Stores) != int(replicas) { glog.Infof("tidbcluster: %s/%s .status.TiKV.Stores.count(%d) != %d", - ns, tcName, len(tc.Status.TiKV.Stores), tc.Spec.TiKV.Replicas) + ns, tcName, len(tc.Status.TiKV.Stores), replicas) return false, nil } if tikvSet.Status.ReadyReplicas != tikvSet.Status.Replicas { diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go index cc22181a32..90618aa670 100644 --- a/tests/cmd/stability/main.go +++ b/tests/cmd/stability/main.go @@ -231,14 +231,26 @@ func run() { // truncate tikv sst file oa.TruncateSSTFileThenCheckFailoverOrDie(clusters[0], 5*time.Minute) - // stop etcd + // stop one etcd faultEtcd := tests.SelectNode(cfg.ETCDs) fta.StopETCDOrDie(faultEtcd) defer fta.StartETCDOrDie(faultEtcd) time.Sleep(3 * time.Minute) - oa.CheckOneEtcdDownOrDie(ocfg, deployedClusters, faultEtcd) + oa.CheckEtcdDownOrDie(ocfg, deployedClusters, faultEtcd) fta.StartETCDOrDie(faultEtcd) + // stop all etcds + fta.StopETCDOrDie() + time.Sleep(10 * time.Minute) + fta.StartETCDOrDie() + oa.CheckEtcdDownOrDie(ocfg, deployedClusters, "") + + // stop all kubelets + fta.StopKubeletOrDie() + time.Sleep(10 * time.Minute) + fta.StartKubeletOrDie() + oa.CheckKubeletDownOrDie(ocfg, deployedClusters, "") + // stop all kube-proxy and k8s/operator/tidbcluster is available fta.StopKubeProxyOrDie() oa.CheckKubeProxyDownOrDie(ocfg, clusters) diff --git a/tests/failover.go b/tests/failover.go index 193f400f4b..83f20d07ad 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -173,7 +173,7 @@ func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, node st if _, exist := affectedPods[failureStore.PodName]; exist { err := fmt.Errorf("cluster: [%s] the tikv store[%s] should be mark failure after %s", info.FullName(), failureStore.PodName, deadline.Format(time.RFC3339)) glog.Errorf(err.Error()) - return false, err + return false, nil } } @@ -472,8 +472,32 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string) return nodeMap, nil } -func (oa *operatorActions) CheckOneEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { - glog.Infof("check k8s/operator/tidbCluster status when one etcd down") +func (oa *operatorActions) CheckKubeletDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { + glog.Infof("check k8s/operator/tidbCluster status when kubelet down") + KeepOrDie(3*time.Second, 10*time.Minute, func() error { + err := oa.CheckK8sAvailable(nil, nil) + if err != nil { + return err + } + glog.V(4).Infof("k8s cluster is available.") + err = oa.CheckOperatorAvailable(operatorConfig) + if err != nil { + return err + } + glog.V(4).Infof("tidb operator is available.") + err = oa.CheckTidbClustersAvailable(clusters) + if err != nil { + return err + } + glog.V(4).Infof("all clusters are available") + return nil + }) +} + +func (oa *operatorActions) CheckEtcdDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { + glog.Infof("check k8s/operator/tidbCluster status when etcd down") + // kube-apiserver may block 15 min + time.Sleep(20 * time.Minute) KeepOrDie(3*time.Second, 10*time.Minute, func() error { err := oa.CheckK8sAvailable(nil, nil) if err != nil { diff --git a/tests/fault.go b/tests/fault.go index 23ea8b5d3f..a6ba6f37e0 100644 --- a/tests/fault.go +++ b/tests/fault.go @@ -4,6 +4,7 @@ import ( "fmt" "math/rand" "os" + "sync" "time" "github.com/pingcap/tidb-operator/tests/slack" @@ -37,8 +38,10 @@ type FaultTriggerActions interface { StopETCDOrDie(nodes ...string) StartETCD(nodes ...string) error StartETCDOrDie(nodes ...string) - StopKubelet(node string) error - StartKubelet(node string) error + StopKubelet(nodes ...string) error + StopKubeletOrDie(nodes ...string) + StartKubelet(nodes ...string) error + StartKubeletOrDie(nodes ...string) StopKubeAPIServer(node string) error StopKubeAPIServerOrDie(node string) StartKubeAPIServer(node string) error @@ -333,22 +336,22 @@ func (fa *faultTriggerActions) StopETCD(nodes ...string) error { } func (fa *faultTriggerActions) StopETCDOrDie(nodes ...string) { + glog.Infof("stopping %v etcds", nodes) if err := fa.StopETCD(nodes...); err != nil { slack.NotifyAndPanic(err) } } -// StartETCD starts the etcd service. -// If the `nodes` is empty, StartETCD will start all etcd service. -func (fa *faultTriggerActions) StartETCD(nodes ...string) error { +// StopKubelet stops the kubelet service. +func (fa *faultTriggerActions) StopKubelet(nodes ...string) error { if len(nodes) == 0 { - for _, ns := range fa.cfg.ETCDs { + for _, ns := range fa.cfg.Nodes { nodes = append(nodes, ns.Nodes...) } } for _, node := range nodes { - if err := fa.serviceAction(node, manager.ETCDService, startAction); err != nil { + if err := fa.serviceAction(node, manager.KubeletService, stopAction); err != nil { return err } } @@ -356,20 +359,64 @@ func (fa *faultTriggerActions) StartETCD(nodes ...string) error { return nil } -func (fa *faultTriggerActions) StartETCDOrDie(nodes ...string) { - if err := fa.StartETCD(nodes...); err != nil { +func (fa *faultTriggerActions) StopKubeletOrDie(nodes ...string) { + glog.Infof("stopping %v kubelets", nodes) + if err := fa.StopKubelet(nodes...); err != nil { slack.NotifyAndPanic(err) } } -// StopKubelet stops the kubelet service. -func (fa *faultTriggerActions) StopKubelet(node string) error { - return fa.serviceAction(node, manager.KubeletService, stopAction) +// StartKubelet starts the kubelet service. +func (fa *faultTriggerActions) StartKubelet(nodes ...string) error { + if len(nodes) == 0 { + for _, ns := range fa.cfg.Nodes { + nodes = append(nodes, ns.Nodes...) + } + } + + for _, node := range nodes { + if err := fa.serviceAction(node, manager.KubeletService, startAction); err != nil { + return err + } + } + + return nil } -// StartKubelet starts the kubelet service. -func (fa *faultTriggerActions) StartKubelet(node string) error { - return fa.serviceAction(node, manager.KubeletService, startAction) +func (fa *faultTriggerActions) StartKubeletOrDie(nodes ...string) { + if err := fa.StartKubelet(nodes...); err != nil { + slack.NotifyAndPanic(err) + } +} + +// StartETCD starts the etcd service. +// If the `nodes` is empty, StartETCD will start all etcd service. +func (fa *faultTriggerActions) StartETCD(nodes ...string) error { + if len(nodes) == 0 { + for _, ns := range fa.cfg.ETCDs { + nodes = append(nodes, ns.Nodes...) + } + } + + var wg sync.WaitGroup + for _, node := range nodes { + wg.Add(1) + go func(n string) { + defer wg.Done() + if err := fa.serviceAction(n, manager.ETCDService, startAction); err != nil { + slack.NotifyAndPanic(fmt.Errorf("failed to start %s etcd, %v", n, err)) + } + }(node) + } + wg.Wait() + + return nil +} + +func (fa *faultTriggerActions) StartETCDOrDie(nodes ...string) { + if err := fa.StartETCD(nodes...); err != nil { + slack.NotifyAndPanic(err) + } } // StopKubeScheduler stops the kube-scheduler service. From 0984a5974aa8d48600feec3e79d748735bf9d4f1 Mon Sep 17 00:00:00 2001 From: weekface Date: Thu, 18 Jul 2019 11:50:59 +0800 Subject: [PATCH 4/6] fix check operator --- tests/failover.go | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/failover.go b/tests/failover.go index 83f20d07ad..b0ed58c789 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -474,6 +474,7 @@ func (oa *operatorActions) GetNodeMap(info *TidbClusterConfig, component string) func (oa *operatorActions) CheckKubeletDownOrDie(operatorConfig *OperatorConfig, clusters []*TidbClusterConfig, faultNode string) { glog.Infof("check k8s/operator/tidbCluster status when kubelet down") + time.Sleep(10 * time.Minute) KeepOrDie(3*time.Second, 10*time.Minute, func() error { err := oa.CheckK8sAvailable(nil, nil) if err != nil { @@ -711,14 +712,22 @@ func (oa *operatorActions) CheckK8sAvailable(excludeNodes map[string]string, exc } func (oa *operatorActions) CheckOperatorAvailable(operatorConfig *OperatorConfig) error { - return wait.Poll(3*time.Second, 3*time.Minute, func() (bool, error) { + var errCount int + var e error + return wait.Poll(10*time.Second, 3*time.Minute, func() (bool, error) { + if errCount >= 10 { + return true, e + } controllerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbControllerName, metav1.GetOptions{}) if err != nil { glog.Errorf("failed to get deployment:%s failed,error:%v", tidbControllerName, err) return false, nil } if controllerDeployment.Status.AvailableReplicas != *controllerDeployment.Spec.Replicas { - return false, fmt.Errorf("the %s is not available", tidbControllerName) + e = fmt.Errorf("the %s is not available", tidbControllerName) + glog.Error(e) + errCount++ + return false, nil } schedulerDeployment, err := oa.kubeCli.AppsV1().Deployments(operatorConfig.Namespace).Get(tidbSchedulerName, metav1.GetOptions{}) if err != nil { @@ -726,7 +735,10 @@ func (oa *operatorActions) CheckOperatorAvailable(operatorConfig *OperatorConfig return false, nil } if schedulerDeployment.Status.AvailableReplicas != *schedulerDeployment.Spec.Replicas { - return false, fmt.Errorf("the %s is not available", tidbSchedulerName) + e = fmt.Errorf("the %s is not available", tidbSchedulerName) + glog.Error(e) + errCount++ + return false, nil } return true, nil }) From 78e47a6ab05e25c3c0f955b8ae2d5404fe6e6ec7 Mon Sep 17 00:00:00 2001 From: weekface Date: Mon, 22 Jul 2019 12:02:04 +0800 Subject: [PATCH 5/6] tinyfix --- tests/actions.go | 1 - tests/failover.go | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/actions.go b/tests/actions.go index ceb83ffa94..09e1a93c52 100644 --- a/tests/actions.go +++ b/tests/actions.go @@ -298,7 +298,6 @@ func (tc *TidbClusterConfig) TidbClusterHelmSetString(m map[string]string) strin "tikv.storageClassName": tc.StorageClassName, "tidb.storageClassName": tc.StorageClassName, "tidb.password": tc.Password, - "pd.maxStoreDownTime": "5m", "pd.image": tc.PDImage, "tikv.image": tc.TiKVImage, "tidb.image": tc.TiDBImage, diff --git a/tests/failover.go b/tests/failover.go index b0ed58c789..8d785013c5 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -276,7 +276,7 @@ func (oa *operatorActions) getPodsByNode(info *TidbClusterConfig, node string) ( } func (oa *operatorActions) CheckFailoverOrDie(clusters []*TidbClusterConfig, faultNode string) { - if err := wait.Poll(1*time.Minute, 30*time.Minute, func() (bool, error) { + if err := wait.Poll(1*time.Minute, 60*time.Minute, func() (bool, error) { var passes []bool for i := range clusters { pass, err := oa.CheckFailover(clusters[i], faultNode) @@ -409,7 +409,7 @@ func (oa *operatorActions) tidbFailover(pod *corev1.Pod, tc *v1alpha1.TidbCluste failure := false for _, failureMember := range tc.Status.TiDB.FailureMembers { if failureMember.PodName == pod.GetName() { - glog.Infof("tidbCluster:[%s/%s]'s store pod:[%s] have not become failuremember", tc.Namespace, tc.Name, pod.Name) + glog.Infof("tidbCluster:[%s/%s]'s store pod:[%s] have become failuremember", tc.Namespace, tc.Name, pod.Name) failure = true break } From b96f3eab087da7aa8f418f0dae8eee0f5a0d0e80 Mon Sep 17 00:00:00 2001 From: weekface Date: Tue, 23 Jul 2019 10:07:07 +0800 Subject: [PATCH 6/6] address comment --- tests/failover.go | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/failover.go b/tests/failover.go index 8d785013c5..0342a8fea5 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -173,6 +173,7 @@ func (oa *operatorActions) CheckFailoverPending(info *TidbClusterConfig, node st if _, exist := affectedPods[failureStore.PodName]; exist { err := fmt.Errorf("cluster: [%s] the tikv store[%s] should be mark failure after %s", info.FullName(), failureStore.PodName, deadline.Format(time.RFC3339)) glog.Errorf(err.Error()) + // There may have been a failover before return false, nil } }