From b71a17a79d4ec9d75944fa08dee72cc2d110fca9 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 22 Dec 2021 10:50:10 -0500 Subject: [PATCH] iam: separate backoffs, add jitter, and increase for conflicts Within our project, we can have multiple instances of terraform modifying iam policies, and in many cases these instances are kicked off at exactly the same time. We're running into errors where we exceed the backoff max (which in reality is 16 seconds, not 30). Also, Google recommends that backoffs contain jitter [1] to prevent clients from retrying all at once in synchronized waves. This change (1) separates the 3 distinct backoffs used in the iam policy read-modify-write cycle, (2) introduces jitter on each retry, and (3) increases the conflict max backoff to 5 minutes. [1] https://cloud.google.com/iot/docs/how-tos/exponential-backoff --- google/iam.go | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/google/iam.go b/google/iam.go index 99b05788a26..fcf9c3e0029 100644 --- a/google/iam.go +++ b/google/iam.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "log" + "math/rand" "reflect" "sort" "strings" @@ -79,13 +80,19 @@ func iamPolicyReadModifyWrite(updater ResourceIamUpdater, modify iamPolicyModify mutexKV.Lock(mutexKey) defer mutexKV.Unlock(mutexKey) - backoff := time.Second + // Used for introducing jitter in backoffs + rand.Seed(time.Now().UTC().UnixNano()) + + readBackoff := time.Second + conflictBackoff := time.Second for { log.Printf("[DEBUG]: Retrieving policy for %s\n", updater.DescribeResource()) p, err := updater.GetResourceIamPolicy() if isGoogleApiErrorWithCode(err, 429) { - log.Printf("[DEBUG] 429 while attempting to read policy for %s, waiting %v before attempting again", updater.DescribeResource(), backoff) - time.Sleep(backoff) + readBackoffWithJitter := readBackoff + time.Duration(rand.Intn(1000))*time.Millisecond + log.Printf("[DEBUG] 429 while attempting to read policy for %s, waiting %v before 
attempting again", updater.DescribeResource(), readBackoffWithJitter) + time.Sleep(readBackoffWithJitter) + readBackoff = readBackoff * 2 continue } else if err != nil { return err @@ -140,14 +147,16 @@ func iamPolicyReadModifyWrite(updater ResourceIamUpdater, modify iamPolicyModify break } if isConflictError(err) { - log.Printf("[DEBUG]: Concurrent policy changes, restarting read-modify-write after %s\n", backoff) - time.Sleep(backoff) - backoff = backoff * 2 - if backoff > 30*time.Second { + conflictBackoffWithJitter := conflictBackoff + time.Duration(rand.Intn(1000))*time.Millisecond + log.Printf("[DEBUG]: Concurrent policy changes, restarting read-modify-write after %v\n", conflictBackoffWithJitter) + time.Sleep(conflictBackoffWithJitter) + conflictBackoff = conflictBackoff * 2 + if conflictBackoff > 5*time.Minute { return errwrap.Wrapf(fmt.Sprintf("Error applying IAM policy to %s: Too many conflicts. Latest error: {{err}}", updater.DescribeResource()), err) } continue } + return errwrap.Wrapf(fmt.Sprintf("Error applying IAM policy for %s: {{err}}", updater.DescribeResource()), err) } log.Printf("[DEBUG]: Set policy for %s", updater.DescribeResource())