From 66f76050161a44629e9763206855a93d0b920d95 Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Wed, 23 Nov 2022 11:42:56 -0600 Subject: [PATCH 1/2] launcher: Change max retry policy to hourly We want the retry to occur ~hourly (with jitter), as this will match the normal refreshed requests. This allows the launcher to provide a token even with long service outages. --- launcher/container_runner.go | 66 ++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 36 deletions(-) diff --git a/launcher/container_runner.go b/launcher/container_runner.go index bc6a7c87..63c5c4ef 100644 --- a/launcher/container_runner.go +++ b/launcher/container_runner.go @@ -57,35 +57,7 @@ const ( snapshotID = "tee-snapshot" ) -/* - Values for token refresh and retries. - -With a 60m token expiration, the refresher goroutine will refresh beginning at .9*60=54m. - -Given the following default arguments, the retry sequence will be, -assuming we go over the MaxElapsedTime: - - RetryInterval = 30s - RandomizationFactor = 0.5 - Multiplier = 1.5 - MaxInterval = 180s - MaxElapsedTime = 600s - - Request # RetryInterval (seconds) Randomized Interval (seconds) - RetryInterval*[1-RandFactor, 1+RandFactor] - 1 30 [15, 45] - 2 60 [30, 90] - 3 120 [60, 180] - 4 180 (MaxInterval) [90, 270] - 5 180 (MaxInterval) [90, 270] - reached MaxElapsedTime backoff.Stop -*/ -const ( - defaultRefreshMultiplier = 0.9 - defaultInitialInterval = 30 * time.Second - defaultMaxInterval = 3 * time.Minute - defaultMaxElapsedTime = 10 * time.Minute -) +const defaultRefreshMultiplier = 0.9 func fetchImpersonatedToken(ctx context.Context, serviceAccount string, audience string, opts ...option.ClientOption) ([]byte, error) { config := impersonate.IDTokenConfig{ @@ -427,15 +399,37 @@ func (r *ContainerRunner) fetchAndWriteTokenWithRetry(ctx context.Context, return nil } -// defaultRetryPolicy retries with: -// initial interval of 30s, multiplication factor of 1.5 -// randomization factor of 0.5, max interval of 3m, and 
-// max elapsed time of 10m. +/* +defaultRetryPolicy retries as follows: + +Given the following arguments, the retry sequence will be: + + InitialInterval = 60 sec + RandomizationFactor = 0.5 + Multiplier = 2 + MaxInterval = 3600 sec + MaxElapsedTime = 0 (never stops retrying) + + Request # RetryInterval (seconds) Randomized Interval (seconds) + RetryInterval*[1-RandFactor, 1+RandFactor] + 1 60 [30, 90] + 2 120 [60, 180] + 3 240 [120, 360] + 4 480 [240, 720] + 5 960 [480, 1440] + 6 1920 [960, 2880] + 7 3600 (MaxInterval) [1800, 5400] + 8 3600 (MaxInterval) [1800, 5400] + ... +*/ func defaultRetryPolicy() *backoff.ExponentialBackOff { expBack := backoff.NewExponentialBackOff() - expBack.InitialInterval = defaultInitialInterval - expBack.MaxInterval = defaultMaxInterval - expBack.MaxElapsedTime = defaultMaxElapsedTime + expBack.InitialInterval = time.Minute + expBack.RandomizationFactor = 0.5 + expBack.Multiplier = 2 + expBack.MaxInterval = time.Hour + // Never stop retrying. + expBack.MaxElapsedTime = 0 return expBack } From 066fa98a19ed4611f9d9d29f9769450bf207f58b Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Wed, 23 Nov 2022 11:53:40 -0600 Subject: [PATCH 2/2] launcher: Add jitter to token refresh time This allows smearing of requests in the event many clients come online at similar times after an outage. Without this, load on the service side would be less balanced over time. Also, test next refresh time is always < expiration. 
--- launcher/container_runner.go | 24 ++++++++++++++++++++++-- launcher/container_runner_test.go | 16 ++++++++++++++++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/launcher/container_runner.go b/launcher/container_runner.go index 63c5c4ef..3f91e95e 100644 --- a/launcher/container_runner.go +++ b/launcher/container_runner.go @@ -8,6 +8,7 @@ import ( "fmt" "io" "log" + "math/rand" "net/url" "os" "path" @@ -57,7 +58,16 @@ const ( snapshotID = "tee-snapshot" ) -const defaultRefreshMultiplier = 0.9 +const ( + // defaultRefreshMultiplier is a multiplier on the current token expiration + // time, at which the refresher goroutine will collect a new token. + // defaultRefreshMultiplier+defaultRefreshJitter should be <1. + defaultRefreshMultiplier = 0.8 + // defaultRefreshJitter is a random component applied additively to the + // refresh multiplier. The wait is a fraction of the time until expiration, in + // [defaultRefreshMultiplier-defaultRefreshJitter, defaultRefreshMultiplier+defaultRefreshJitter). + defaultRefreshJitter = 0.1 +) func fetchImpersonatedToken(ctx context.Context, serviceAccount string, audience string, opts ...option.ClientOption) ([]byte, error) { config := impersonate.IDTokenConfig{ @@ -345,7 +355,7 @@ func (r *ContainerRunner) refreshToken(ctx context.Context) (time.Duration, erro } r.logger.Println(string(claimsString)) - return time.Duration(float64(time.Until(claims.ExpiresAt.Time)) * defaultRefreshMultiplier), nil + return getNextRefreshFromExpiration(time.Until(claims.ExpiresAt.Time), rand.Float64()), nil } // ctx must be a cancellable context. @@ -399,6 +409,16 @@ func (r *ContainerRunner) fetchAndWriteTokenWithRetry(ctx context.Context, return nil } +// getNextRefreshFromExpiration returns the Duration to wait before the next run +// of the token refresher goroutine. expiration is the time remaining until the +// token expires and must be positive. random is a jitter sample in [0, 1). 
+func getNextRefreshFromExpiration(expiration time.Duration, random float64) time.Duration { + diff := defaultRefreshJitter * float64(expiration) + center := defaultRefreshMultiplier * float64(expiration) + minRange := center - diff + return time.Duration(minRange + random*2*diff) +} + /* defaultRetryPolicy retries as follows: diff --git a/launcher/container_runner_test.go b/launcher/container_runner_test.go index 1b2663dc..d1a32a56 100644 --- a/launcher/container_runner_test.go +++ b/launcher/container_runner_test.go @@ -448,3 +448,19 @@ func TestFetchImpersonatedToken(t *testing.T) { t.Errorf("fetchImpersonatedToken did not return expected token: got %v, want %v", token, expectedToken) } } + +func TestGetNextRefresh(t *testing.T) { + // 0 <= random < 1. + for _, randNum := range []float64{0, .1415926, .5, .75, .999999999} { + // expiration should always be >0. + // 0 or negative expiration means the token has already expired. + for _, expInt := range []int64{1, 10, 100, 1000, 10000, 1000000} { + expDuration := time.Duration(expInt) + next := getNextRefreshFromExpiration(expDuration, randNum) + if next >= expDuration { + t.Errorf("getNextRefreshFromExpiration(%v, %v) = %v next refresh. expected %v (next refresh) < %v (expiration)", + expDuration, randNum, next, next, expDuration) + } + } + } +}