From 8360887741badd6eb69c3dca9125f72de877d87f Mon Sep 17 00:00:00 2001 From: zyguan Date: Mon, 27 May 2019 14:56:21 +0800 Subject: [PATCH] stability: retry truncating sst files upon failure (#484) * stability: retry truncating sst files upon failure --- tests/failover.go | 7 ++-- tests/pkg/ops/tikv.go | 84 ++++++++++++++++++++++++++++--------------- 2 files changed, 61 insertions(+), 30 deletions(-) diff --git a/tests/failover.go b/tests/failover.go index 798a37def9..f6ae2544e7 100644 --- a/tests/failover.go +++ b/tests/failover.go @@ -7,8 +7,6 @@ import ( "strings" "time" - "github.com/pingcap/tidb-operator/tests/slack" - // To register MySQL driver _ "github.com/go-sql-driver/mysql" "github.com/golang/glog" @@ -17,6 +15,7 @@ import ( "github.com/pingcap/tidb-operator/pkg/label" "github.com/pingcap/tidb-operator/tests/pkg/client" "github.com/pingcap/tidb-operator/tests/pkg/ops" + "github.com/pingcap/tidb-operator/tests/slack" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -65,6 +64,8 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon glog.Infof("deleting pod: [%s/%s] and wait 1 minute for the pod to terminate", info.Namespace, store.PodName) err = cli.CoreV1().Pods(info.Namespace).Delete(store.PodName, nil) if err != nil { + glog.Errorf("failed to get delete the pod: ns=%s tc=%s pod=%s err=%s", + info.Namespace, info.ClusterName, store.PodName, err.Error()) return err } @@ -77,6 +78,8 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon Store: store.ID, }) if err != nil { + glog.Errorf("failed to truncate the sst file: ns=%s tc=%s store=%s err=%s", + info.Namespace, info.ClusterName, store.ID, err.Error()) return err } oa.EmitEvent(info, fmt.Sprintf("TruncateSSTFile: tikv: %s/%s", info.Namespace, store.PodName)) diff --git a/tests/pkg/ops/tikv.go b/tests/pkg/ops/tikv.go index ddf1da4903..9e399449c3 100644 --- a/tests/pkg/ops/tikv.go +++ b/tests/pkg/ops/tikv.go @@ -14,13 +14,21 @@ package ops import ( + "fmt" + "strconv" "strings" + "time" "github.com/golang/glog" "github.com/pingcap/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +const ( + retryLimit = 15 + maxSSTFilesToTruncate = 20 +) + type TruncateOptions struct { Namespace string Cluster string @@ -32,7 +40,7 @@ type TiKVOps struct { } func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error { - glog.Infof("truncate sst option: %+v", opts) + logHdr := fmt.Sprintf("store: %s cluster: [%s/%s] ", opts.Store, opts.Namespace, opts.Cluster) tc, err := ops.PingcapV1alpha1().TidbClusters(opts.Namespace).Get(opts.Cluster, metav1.GetOptions{}) if err != nil { @@ -54,41 +62,61 @@ func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error { }) } - stdout, stderr, err := exec("find", "/var/lib/tikv/db", "-name", "*.sst", "-o", "-name", "*.save") - if err != nil { - glog.Errorf("list sst files: stderr=%s err=%s", stderr, err.Error()) - return errors.Annotate(err, "list sst files") - } + retryCount := 0 + for ; retryCount < retryLimit; retryCount++ { + if retryCount > 0 { + time.Sleep(10 * time.Second) + } + stdout, stderr, err := exec("find", "/var/lib/tikv/db", "-name", "*.sst", "-o", "-name", "*.save") + if err != nil { + glog.Warningf(logHdr+"list sst files: stderr=%s err=%s", stderr, err.Error()) + continue + } - sstCandidates := make(map[string]bool) + sstCandidates := make(map[string]bool) - for _, f := range strings.Split(stdout, "\n") { - f = strings.TrimSpace(f) - if len(f) > 0 { - sstCandidates[f] = true + for _, f := range strings.Split(stdout, "\n") { + f = strings.TrimSpace(f) + if len(f) > 0 { + sstCandidates[f] = true + } } - } - sst := "" - for k := range sstCandidates { - if strings.HasSuffix(k, ".sst") && !sstCandidates[k+".save"] { - sst = k + ssts := make([]string, 0, maxSSTFilesToTruncate) + for k := range sstCandidates { + if len(ssts) >= maxSSTFilesToTruncate { + break + } + if strings.HasSuffix(k, ".sst") && !sstCandidates[k+".save"] { + ssts = append(ssts, k) + } + } + if len(ssts) == 0 { + glog.Warning(logHdr + "cannot find a sst file") + continue } - } - if len(sst) == 0 { - return errors.New("cannot find a sst file") - } - _, stderr, err = exec("cp", sst, sst+".save") - if err != nil { - glog.Errorf("backup sst file: stderr=%s err=%s", stderr, err.Error()) - return errors.Annotate(err, "backup sst file") + truncated := 0 + for _, sst := range ssts { + _, stderr, err = exec("sh", "-c", + fmt.Sprintf("cp %s %s.save && truncate -s 0 %s", sst, sst, sst)) + if err != nil { + glog.Warningf(logHdr+"truncate sst file: sst=%s stderr=%s err=%s", sst, stderr, err.Error()) + continue + } + truncated++ + } + if truncated == 0 { + glog.Warningf(logHdr + "no sst file has been truncated") + continue + } + + glog.Infof(logHdr+"%d sst files got truncated", truncated) + break } - _, stderr, err = exec("truncate", "-s", "0", sst) - if err != nil { - glog.Errorf("truncate sst file: stderr=%s err=%s", stderr, err.Error()) - return errors.Annotate(err, "truncate sst file") + if retryCount == retryLimit { + return errors.New("failed to truncate sst file after " + strconv.Itoa(retryLimit) + " trials") } return nil