Skip to content

Commit

Permalink
Add jitter to scheduled snapshots and retry harder on conflicts
Browse files Browse the repository at this point in the history
Also ensure that the snapshot job does not attempt to trigger multiple concurrent runs, as this is not supported.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
  • Loading branch information
brandond committed Jan 26, 2023
1 parent 9176e03 commit 6344590
Showing 1 changed file with 28 additions and 4 deletions.
32 changes: 28 additions & 4 deletions pkg/etcd/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"fmt"
"io"
"io/fs"
"math/rand"
"net"
"net/http"
"net/url"
Expand Down Expand Up @@ -49,6 +50,7 @@ import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilnet "k8s.io/apimachinery/pkg/util/net"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/util/retry"
)

Expand All @@ -58,6 +60,10 @@ const (
learnerMaxStallTime = time.Minute * 5
memberRemovalTimeout = time.Minute * 1

// snapshotJitterMax defines the maximum time skew on cron-triggered snapshots. The actual jitter
// will be a random Duration somewhere between 0 and snapshotJitterMax.
snapshotJitterMax = time.Second * 5

// defaultDialTimeout is intentionally short so that connections time out within the testTimeout defined above
defaultDialTimeout = 2 * time.Second
// other defaults from k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go
Expand All @@ -81,6 +87,19 @@ var (
snapshotExtraMetadataConfigMapName = version.Program + "-etcd-snapshot-extra-metadata"
snapshotConfigMapName = version.Program + "-etcd-snapshots"

// snapshotDataBackoff will retry at increasing steps for up to ~30 seconds.
// If the ConfigMap update fails, the list won't be reconciled again until next time
// the server starts, so we should be fairly persistent in retrying.
snapshotDataBackoff = wait.Backoff{
Steps: 9,
Duration: 10 * time.Millisecond,
Factor: 3.0,
Jitter: 0.1,
}

// cronLogger wraps logrus's Printf output as a cron-compatible logger
cronLogger = cron.VerbosePrintfLogger(logrus.StandardLogger())

NodeNameAnnotation = "etcd." + version.Program + ".cattle.io/node-name"
NodeAddressAnnotation = "etcd." + version.Program + ".cattle.io/node-address"

Expand Down Expand Up @@ -139,7 +158,7 @@ func errNotMember() error { return &MembershipError{} }
// ETCD with an initialized cron value.
func NewETCD() *ETCD {
return &ETCD{
cron: cron.New(),
cron: cron.New(cron.WithLogger(cronLogger)),
}
}

Expand Down Expand Up @@ -1712,7 +1731,7 @@ func (e *ETCD) DeleteSnapshots(ctx context.Context, snapshots []string) error {
// addSnapshotData adds the given snapshot file information to the snapshot configmap, using the existing extra metadata
// available at the time.
func (e *ETCD) addSnapshotData(sf snapshotFile) error {
return retry.OnError(retry.DefaultBackoff, func(err error) bool {
return retry.OnError(snapshotDataBackoff, func(err error) bool {
return apierrors.IsConflict(err) || apierrors.IsAlreadyExists(err)
}, func() error {
// make sure the core.Factory is initialized. There can
Expand Down Expand Up @@ -1920,11 +1939,16 @@ func (e *ETCD) ReconcileSnapshotData(ctx context.Context) error {

// setSnapshotFunction schedules snapshots at the configured interval.
func (e *ETCD) setSnapshotFunction(ctx context.Context) {
e.cron.AddFunc(e.config.EtcdSnapshotCron, func() {
skipJob := cron.SkipIfStillRunning(cronLogger)
e.cron.AddJob(e.config.EtcdSnapshotCron, skipJob(cron.FuncJob(func() {
// Add a small amount of jitter to the actual snapshot execution. On clusters with multiple servers,
// having all the nodes take a snapshot at the exact same time can lead to excessive retry thrashing
// when updating the snapshot list configmap.
time.Sleep(time.Duration(rand.Float64() * float64(snapshotJitterMax)))
if err := e.Snapshot(ctx, e.config); err != nil {
logrus.Error(err)
}
})
})))
}

// Restore performs a restore of the ETCD datastore from
Expand Down

0 comments on commit 6344590

Please sign in to comment.