Skip to content

Commit

Permalink
stability: retry truncating sst files upon failure (#484)
Browse files Browse the repository at this point in the history
* stability: retry truncating sst files upon failure
  • Loading branch information
zyguan authored and weekface committed May 27, 2019
1 parent 8aa2386 commit 8360887
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 30 deletions.
7 changes: 5 additions & 2 deletions tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ import (
"strings"
"time"

"github.com/pingcap/tidb-operator/tests/slack"

// To register MySQL driver
_ "github.com/go-sql-driver/mysql"
"github.com/golang/glog"
Expand All @@ -17,6 +15,7 @@ import (
"github.com/pingcap/tidb-operator/pkg/label"
"github.com/pingcap/tidb-operator/tests/pkg/client"
"github.com/pingcap/tidb-operator/tests/pkg/ops"
"github.com/pingcap/tidb-operator/tests/slack"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
Expand Down Expand Up @@ -65,6 +64,8 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
glog.Infof("deleting pod: [%s/%s] and wait 1 minute for the pod to terminate", info.Namespace, store.PodName)
err = cli.CoreV1().Pods(info.Namespace).Delete(store.PodName, nil)
if err != nil {
glog.Errorf("failed to get delete the pod: ns=%s tc=%s pod=%s err=%s",
info.Namespace, info.ClusterName, store.PodName, err.Error())
return err
}

Expand All @@ -77,6 +78,8 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
Store: store.ID,
})
if err != nil {
glog.Errorf("failed to truncate the sst file: ns=%s tc=%s store=%s err=%s",
info.Namespace, info.ClusterName, store.ID, err.Error())
return err
}
oa.EmitEvent(info, fmt.Sprintf("TruncateSSTFile: tikv: %s/%s", info.Namespace, store.PodName))
Expand Down
84 changes: 56 additions & 28 deletions tests/pkg/ops/tikv.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,21 @@
package ops

import (
"fmt"
"strconv"
"strings"
"time"

"github.com/golang/glog"
"github.com/pingcap/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
// retryLimit is the maximum number of attempts TruncateSSTFile makes
// (listing sst files and truncating them) before it gives up and returns
// an error. Attempts after the first are preceded by a 10 second sleep.
retryLimit = 15
// maxSSTFilesToTruncate caps how many sst files (those without an existing
// ".save" backup) are selected for truncation in a single attempt.
maxSSTFilesToTruncate = 20
)

type TruncateOptions struct {
Namespace string
Cluster string
Expand All @@ -32,7 +40,7 @@ type TiKVOps struct {
}

func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error {
glog.Infof("truncate sst option: %+v", opts)
logHdr := fmt.Sprintf("store: %s cluster: [%s/%s] ", opts.Store, opts.Namespace, opts.Cluster)

tc, err := ops.PingcapV1alpha1().TidbClusters(opts.Namespace).Get(opts.Cluster, metav1.GetOptions{})
if err != nil {
Expand All @@ -54,41 +62,61 @@ func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error {
})
}

stdout, stderr, err := exec("find", "/var/lib/tikv/db", "-name", "*.sst", "-o", "-name", "*.save")
if err != nil {
glog.Errorf("list sst files: stderr=%s err=%s", stderr, err.Error())
return errors.Annotate(err, "list sst files")
}
retryCount := 0
for ; retryCount < retryLimit; retryCount++ {
if retryCount > 0 {
time.Sleep(10 * time.Second)
}
stdout, stderr, err := exec("find", "/var/lib/tikv/db", "-name", "*.sst", "-o", "-name", "*.save")
if err != nil {
glog.Warningf(logHdr+"list sst files: stderr=%s err=%s", stderr, err.Error())
continue
}

sstCandidates := make(map[string]bool)
sstCandidates := make(map[string]bool)

for _, f := range strings.Split(stdout, "\n") {
f = strings.TrimSpace(f)
if len(f) > 0 {
sstCandidates[f] = true
for _, f := range strings.Split(stdout, "\n") {
f = strings.TrimSpace(f)
if len(f) > 0 {
sstCandidates[f] = true
}
}
}

sst := ""
for k := range sstCandidates {
if strings.HasSuffix(k, ".sst") && !sstCandidates[k+".save"] {
sst = k
ssts := make([]string, 0, maxSSTFilesToTruncate)
for k := range sstCandidates {
if len(ssts) >= maxSSTFilesToTruncate {
break
}
if strings.HasSuffix(k, ".sst") && !sstCandidates[k+".save"] {
ssts = append(ssts, k)
}
}
if len(ssts) == 0 {
glog.Warning(logHdr + "cannot find a sst file")
continue
}
}
if len(sst) == 0 {
return errors.New("cannot find a sst file")
}

_, stderr, err = exec("cp", sst, sst+".save")
if err != nil {
glog.Errorf("backup sst file: stderr=%s err=%s", stderr, err.Error())
return errors.Annotate(err, "backup sst file")
truncated := 0
for _, sst := range ssts {
_, stderr, err = exec("sh", "-c",
fmt.Sprintf("cp %s %s.save && truncate -s 0 %s", sst, sst, sst))
if err != nil {
glog.Warningf(logHdr+"truncate sst file: sst=%s stderr=%s err=%s", sst, stderr, err.Error())
continue
}
truncated++
}
if truncated == 0 {
glog.Warningf(logHdr + "no sst file has been truncated")
continue
}

glog.Infof(logHdr+"%d sst files got truncated", truncated)
break
}

_, stderr, err = exec("truncate", "-s", "0", sst)
if err != nil {
glog.Errorf("truncate sst file: stderr=%s err=%s", stderr, err.Error())
return errors.Annotate(err, "truncate sst file")
if retryCount == retryLimit {
return errors.New("failed to truncate sst file after " + strconv.Itoa(retryLimit) + " trials")
}

return nil
Expand Down

0 comments on commit 8360887

Please sign in to comment.