Skip to content

Commit

Permalink
Use full snapshot interval to compute Backup Ready conditions (#906)
Browse files Browse the repository at this point in the history
* get full snapshot interval from schedule to determine backup ready condition
  • Loading branch information
anveshreddy18 authored Nov 6, 2024
1 parent f15dc84 commit fab4318
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 7 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ require (
github.com/hashicorp/go-multierror v1.1.1
github.com/ironcore-dev/vgopath v0.1.4
github.com/prometheus/client_golang v1.18.0
github.com/robfig/cron/v3 v3.0.1
github.com/spf13/pflag v1.0.5
go.uber.org/zap v1.27.0
golang.org/x/exp v0.0.0-20240707233637-46b078467d37
Expand Down Expand Up @@ -111,7 +112,6 @@ require (
github.com/prometheus/client_model v0.6.0 // indirect
github.com/prometheus/common v0.45.0 // indirect
github.com/prometheus/procfs v0.12.0 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/shopspring/decimal v1.3.1 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/spf13/afero v1.11.0 // indirect
Expand Down
21 changes: 15 additions & 6 deletions internal/health/condition/check_backup_ready.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"time"

druidv1alpha1 "github.com/gardener/etcd-druid/api/v1alpha1"
"github.com/gardener/etcd-druid/internal/utils"

coordinationv1 "k8s.io/api/coordination/v1"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -48,13 +49,21 @@ func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) R

//Fetch snapshot leases
var (
fullSnapErr, incrSnapErr error
fullSnapLease = &coordinationv1.Lease{}
deltaSnapLease = &coordinationv1.Lease{}
fullSnapErr, incrSnapErr, err error
fullSnapLease = &coordinationv1.Lease{}
fullSnapshotInterval = 24 * time.Hour
deltaSnapLease = &coordinationv1.Lease{}
)
fullSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getFullSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, fullSnapLease)
incrSnapErr = a.cl.Get(ctx, types.NamespacedName{Name: getDeltaSnapLeaseName(&etcd), Namespace: etcd.ObjectMeta.Namespace}, deltaSnapLease)

// Compute the full snapshot interval if full snapshot schedule is set
if etcd.Spec.Backup.FullSnapshotSchedule != nil {
if fullSnapshotInterval, err = utils.ComputeScheduleInterval(*etcd.Spec.Backup.FullSnapshotSchedule); err != nil {
return result
}
}

//Set status to Unknown if errors in fetching snapshot leases or lease never renewed
if fullSnapErr != nil || incrSnapErr != nil || (fullSnapLease.Spec.RenewTime == nil && deltaSnapLease.Spec.RenewTime == nil) {
return result
Expand All @@ -66,8 +75,8 @@ func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) R

if fullLeaseRenewTime == nil && deltaLeaseRenewTime != nil {
// Most probable during reconcile of existing clusters if fresh leases are created
// Treat backup as succeeded if delta snap lease renewal happens in the required time window and full snap lease is not older than 24h.
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseCreateTime.Time) < 24*time.Hour {
// Treat backup as succeeded if delta snap lease renewal happens in the required time window and full snap lease is not older than fullSnapshotInterval
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseCreateTime.Time) < fullSnapshotInterval {
result.reason = BackupSucceeded
result.message = "Delta snapshot backup succeeded"
result.status = druidv1alpha1.ConditionTrue
Expand All @@ -82,7 +91,7 @@ func (a *backupReadyCheck) Check(ctx context.Context, etcd druidv1alpha1.Etcd) R
}
} else if deltaLeaseRenewTime != nil && fullLeaseRenewTime != nil {
//Both snap leases are maintained. Both are expected to be renewed periodically
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseRenewTime.Time) < 24*time.Hour {
if time.Since(deltaLeaseRenewTime.Time) < 2*etcd.Spec.Backup.DeltaSnapshotPeriod.Duration && time.Since(fullLeaseRenewTime.Time) < fullSnapshotInterval {
result.reason = BackupSucceeded
result.message = "Snapshot backup succeeded"
result.status = druidv1alpha1.ConditionTrue
Expand Down
18 changes: 18 additions & 0 deletions internal/utils/miscellaneous.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ import (
"fmt"
"maps"
"strings"
"time"

"github.com/robfig/cron/v3"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand Down Expand Up @@ -63,3 +65,19 @@ func IfConditionOr[T any](condition bool, trueVal, falseVal T) T {
}
return falseVal
}

// ComputeScheduleInterval computes the interval between two consecutive activations of the given
// cron schedule. The schedule is parsed with cron.ParseStandard, and the interval is measured
// between the next activation after the current time and the activation that follows it.
//
// It assumes that every cron activation is at equal intervals apart, based on cron schedules such as
// "once every X hours", "once every Y days", "at 1:00pm on every Tuesday", etc. For unevenly spaced
// schedules the result only describes the upcoming pair of activations.
//
// An error is returned if the schedule cannot be parsed, or if no upcoming activation can be
// determined for it (for example a day-of-month/month combination that never occurs). In the latter
// case the interval would otherwise silently evaluate to zero.
//
// TODO: write a new function to accurately compute the previous activation time from the cron schedule
// in order to compute when the previous activation of the cron schedule was supposed to have occurred,
// instead of relying on the assumption that all the cron activations are evenly spaced.
func ComputeScheduleInterval(cronSchedule string) (time.Duration, error) {
	schedule, err := cron.ParseStandard(cronSchedule)
	if err != nil {
		return 0, fmt.Errorf("parsing cron schedule %q: %w", cronSchedule, err)
	}

	nextScheduledTime := schedule.Next(time.Now())
	nextNextScheduledTime := schedule.Next(nextScheduledTime)

	// Schedule.Next yields the zero time when it cannot find an activation; subtracting such
	// values would produce a meaningless interval, so surface it as an error instead.
	if nextScheduledTime.IsZero() || nextNextScheduledTime.IsZero() {
		return 0, fmt.Errorf("cron schedule %q has no upcoming activations", cronSchedule)
	}

	return nextNextScheduledTime.Sub(nextScheduledTime), nil
}

0 comments on commit fab4318

Please sign in to comment.