Skip to content

Commit

Permalink
iam: separate backoffs, add jitter, and increase for conflicts
Browse files Browse the repository at this point in the history
Within our project, we can have multiple instances of terraform
modifying iam policies, and in many cases these instances are kicked off
at exactly the same time. We're running into errors where we exceed the
backoff max (which in reality is 16 seconds, not 30). Also, Google
reccomends that backoffs contain jitter [1] to prevent clients from
retrying all at once in synchronized waves.

This change (1) separates the 3 distinct backoffs used in the iam policy
read-modify-write cycle, (2) introduces jitter on each retry, and (3)
increases the conflict max backoff to 5 minutes.

[1] https://cloud.google.com/iot/docs/how-tos/exponential-backoff
  • Loading branch information
stbenjam committed Dec 22, 2021
1 parent 4bc5161 commit 100cd5f
Showing 1 changed file with 17 additions and 10 deletions.
27 changes: 17 additions & 10 deletions google/iam.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"log"
"math/rand"
"reflect"
"sort"
"strings"
Expand Down Expand Up @@ -80,13 +81,17 @@ func iamPolicyReadModifyWrite(updater ResourceIamUpdater, modify iamPolicyModify
mutexKV.Lock(mutexKey)
defer mutexKV.Unlock(mutexKey)

backoff := time.Second
readBackoff := time.Second
conflictBackoff := time.Second
serviceAccountReadBackoff := time.Second
for {
log.Printf("[DEBUG]: Retrieving policy for %s\n", updater.DescribeResource())
p, err := updater.GetResourceIamPolicy()
if isGoogleApiErrorWithCode(err, 429) {
log.Printf("[DEBUG] 429 while attempting to read policy for %s, waiting %v before attempting again", updater.DescribeResource(), backoff)
time.Sleep(backoff)
readBackoffWithJitter := readBackoff + time.Duration(rand.Intn(1000))*time.Millisecond
log.Printf("[DEBUG] 429 while attempting to read policy for %s, waiting %v before attempting again", updater.DescribeResource(), readBackoffWithJitter)
time.Sleep(readBackoffWithJitter)
readBackoff = readBackoff * 2
continue
} else if err != nil {
return err
Expand Down Expand Up @@ -141,10 +146,11 @@ func iamPolicyReadModifyWrite(updater ResourceIamUpdater, modify iamPolicyModify
break
}
if isConflictError(err) {
log.Printf("[DEBUG]: Concurrent policy changes, restarting read-modify-write after %s\n", backoff)
time.Sleep(backoff)
backoff = backoff * 2
if backoff > 30*time.Second {
conflictBackoffWithJitter := conflictBackoff + time.Duration(rand.Intn(1000))*time.Millisecond
log.Printf("[DEBUG]: Concurrent policy changes, restarting read-modify-write after %v\n", conflictBackoffWithJitter)
time.Sleep(conflictBackoffWithJitter)
conflictBackoff = conflictBackoff * 2
if conflictBackoff > 5*time.Minute {
return errwrap.Wrapf(fmt.Sprintf("Error applying IAM policy to %s: Too many conflicts. Latest error: {{err}}", updater.DescribeResource()), err)
}
continue
Expand All @@ -160,9 +166,10 @@ func iamPolicyReadModifyWrite(updater ResourceIamUpdater, modify iamPolicyModify
if rerr != nil {
if p.Etag != currentPolicy.Etag {
// not matching indicates that there is a new state to attempt to apply
log.Printf("current and old etag did not match for %s, retrying", updater.DescribeResource())
time.Sleep(backoff)
backoff = backoff * 2
serviceAccountBackoffWithJitter := serviceAccountReadBackoff + time.Duration(rand.Intn(1000))*time.Millisecond
log.Printf("current and old etag did not match for %s, retrying after %v", updater.DescribeResource(), serviceAccountBackoffWithJitter)
time.Sleep(serviceAccountBackoffWithJitter)
serviceAccountReadBackoff = serviceAccountReadBackoff * 2
continue
}

Expand Down

0 comments on commit 100cd5f

Please sign in to comment.