From 1ad1a773def0a6c1fb41fed19f06f1a1c24b08b0 Mon Sep 17 00:00:00 2001
From: Stephen Benjamin
Date: Wed, 22 Dec 2021 10:50:10 -0500
Subject: [PATCH] iam: separate backoffs, add jitter, and increase for conflicts

Within our project, we can have multiple instances of Terraform modifying
IAM policies, and in many cases these instances are kicked off at exactly
the same time. We're running into errors where we exceed the maximum
backoff (which in reality is 16 seconds, not 30).

Also, Google recommends that backoffs contain jitter [1] to prevent clients
from retrying all at once in synchronized waves.

This change (1) separates the 3 distinct backoffs used in the IAM policy
read-modify-write cycle, (2) introduces jitter on each retry, and (3)
increases the conflict max backoff to 5 minutes.

[1] https://cloud.google.com/iot/docs/how-tos/exponential-backoff
---
 google/iam.go | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/google/iam.go b/google/iam.go
index 7f953bf983f..75e7197e5ad 100644
--- a/google/iam.go
+++ b/google/iam.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"log"
+	"math/rand"
 	"reflect"
 	"sort"
 	"strings"
@@ -80,13 +81,20 @@ func iamPolicyReadModifyWrite(updater ResourceIamUpdater, modify iamPolicyModify
 	mutexKV.Lock(mutexKey)
 	defer mutexKV.Unlock(mutexKey)
 
-	backoff := time.Second
+	// Used for introducing jitter in backoffs
+	rand.Seed(time.Now().UTC().UnixNano())
+
+	readBackoff := time.Second
+	conflictBackoff := time.Second
+	serviceAccountReadBackoff := time.Second
 	for {
 		log.Printf("[DEBUG]: Retrieving policy for %s\n", updater.DescribeResource())
 		p, err := updater.GetResourceIamPolicy()
 		if isGoogleApiErrorWithCode(err, 429) {
-			log.Printf("[DEBUG] 429 while attempting to read policy for %s, waiting %v before attempting again", updater.DescribeResource(), backoff)
-			time.Sleep(backoff)
+			readBackoffWithJitter := readBackoff + time.Duration(rand.Intn(1000))*time.Millisecond
+			log.Printf("[DEBUG] 429 while attempting to read policy for %s, waiting %v before attempting again", updater.DescribeResource(), readBackoffWithJitter)
+			time.Sleep(readBackoffWithJitter)
+			readBackoff = readBackoff * 2
 			continue
 		} else if err != nil {
 			return err
@@ -141,10 +149,11 @@ func iamPolicyReadModifyWrite(updater ResourceIamUpdater, modify iamPolicyModify
 			break
 		}
 		if isConflictError(err) {
-			log.Printf("[DEBUG]: Concurrent policy changes, restarting read-modify-write after %s\n", backoff)
-			time.Sleep(backoff)
-			backoff = backoff * 2
-			if backoff > 30*time.Second {
+			conflictBackoffWithJitter := conflictBackoff + time.Duration(rand.Intn(1000))*time.Millisecond
+			log.Printf("[DEBUG]: Concurrent policy changes, restarting read-modify-write after %v\n", conflictBackoffWithJitter)
+			time.Sleep(conflictBackoffWithJitter)
+			conflictBackoff = conflictBackoff * 2
+			if conflictBackoff > 5*time.Minute {
 				return errwrap.Wrapf(fmt.Sprintf("Error applying IAM policy to %s: Too many conflicts. Latest error: {{err}}", updater.DescribeResource()), err)
 			}
 			continue
@@ -160,9 +169,10 @@ func iamPolicyReadModifyWrite(updater ResourceIamUpdater, modify iamPolicyModify
 		if rerr != nil {
 			if p.Etag != currentPolicy.Etag {
 				// not matching indicates that there is a new state to attempt to apply
-				log.Printf("current and old etag did not match for %s, retrying", updater.DescribeResource())
-				time.Sleep(backoff)
-				backoff = backoff * 2
+				serviceAccountBackoffWithJitter := serviceAccountReadBackoff + time.Duration(rand.Intn(1000))*time.Millisecond
+				log.Printf("current and old etag did not match for %s, retrying after %v", updater.DescribeResource(), serviceAccountBackoffWithJitter)
+				time.Sleep(serviceAccountBackoffWithJitter)
+				serviceAccountReadBackoff = serviceAccountReadBackoff * 2
 				continue
 			}
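
Note for reviewers: the sketch below is a minimal, standalone illustration of
the retry shape this patch applies to each of the three backoffs (exponential
doubling, up to one second of random jitter per attempt, and a cap at which we
give up). It is not code from this patch; the retryWithJitter helper and its
parameter names are invented for the example.

package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// retryWithJitter keeps calling op until it succeeds, sleeping between
// attempts. The sleep doubles each time and gains up to 1s of random jitter
// so concurrent clients do not retry in lockstep; once the base backoff
// exceeds maxBackoff, it gives up and returns the last error.
func retryWithJitter(op func() error, maxBackoff time.Duration) error {
	backoff := time.Second
	for {
		err := op()
		if err == nil {
			return nil
		}
		if backoff > maxBackoff {
			return fmt.Errorf("giving up after exceeding %v: %w", maxBackoff, err)
		}
		wait := backoff + time.Duration(rand.Intn(1000))*time.Millisecond
		fmt.Printf("retrying after %v\n", wait)
		time.Sleep(wait)
		backoff *= 2
	}
}

func main() {
	// Simulate an operation that conflicts twice before succeeding.
	attempts := 0
	err := retryWithJitter(func() error {
		attempts++
		if attempts < 3 {
			return errors.New("simulated conflict")
		}
		return nil
	}, 30*time.Second)
	fmt.Println("done:", err)
}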