Skip to content

Commit

Permalink
Randomly choose backoff when bgw unavailable
Browse files Browse the repository at this point in the history
  • Loading branch information
konskov committed Aug 31, 2022
1 parent 07abe53 commit 07b074f
Showing 1 changed file with 25 additions and 6 deletions.
31 changes: 25 additions & 6 deletions src/bgw/job_stat.c
Original file line number Diff line number Diff line change
Expand Up @@ -199,8 +199,24 @@ calculate_next_start_on_failure(TimestampTz finish_time, int consecutive_failure
TimestampTz res = 0;
volatile bool res_set = false;
TimestampTz last_finish = finish_time;
float8 exponent = (consecutive_failures > MAX_FAILURES_MULTIPLIER ? MAX_FAILURES_MULTIPLIER :
consecutive_failures);
int exponent = (consecutive_failures > MAX_FAILURES_MULTIPLIER ? MAX_FAILURES_MULTIPLIER :
consecutive_failures);
/*
 * The aim of backoff is to reduce conflicts among jobs that fail to launch,
 * since in that case jobs are competing for available background workers.
 * We therefore introduce some randomness into the backoff calculation for
 * that type of failure: a random offset in [0, 2^consecutive_failures - 1)
 * seconds is added to the 2-second base retry interval, so the resulting
 * backoff falls in [2, 2^consecutive_failures + 1) seconds.
 * Otherwise, as soon as a worker becomes available, all jobs would try to
 * grab it at once even though only one of them can get it. Jobs that
 * experience launch failures therefore delay their next_start by a random
 * amount to reduce the probability of collisions.
 */
Assert(consecutive_failures > 0);
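// number of one-second backoff slots: 2^(consecutive_failures) - 1, intended to be at most 2^20
// NOTE(review): the shift below uses the raw consecutive_failures, not the clamped exponent,
// so this bound is not enforced here -- confirm callers cap the value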
int64 max_slots = (1 << consecutive_failures) - 1;
// granularity: 1 usec (2^20)
// will get a random int in [0, (2^f - 1) * 1000000), where f = consecutive_failures
// this represents a random number of microseconds to add to the backoff
int64 rand_backoff = random() % (max_slots * USECS_PER_SEC);
MemoryContext oldctx;

if (!IS_VALID_TIMESTAMP(finish_time))
Expand All @@ -217,22 +233,25 @@ calculate_next_start_on_failure(TimestampTz finish_time, int consecutive_failure
// max wait time to launch job is 1 minute
Interval interval_max = { .time = 60000000 };
Interval retry_ival = { .time = 2000000 };
retry_ival.time += rand_backoff;
/* NULL job means the failure was a launch failure */
if (job)
ival = IntervalPGetDatum(&job->fd.retry_period);
else
{
// retry every 2 seconds
// random backoff in seconds, in [2, 2 + (2^f - 1)), where f = consecutive_failures
ival = IntervalPGetDatum(&retry_ival);
}

/* ival = retry_period * 2^(exponent - 1), where exponent is consecutive_failures capped at MAX_FAILURES_MULTIPLIER */
/* arbitrarily choose 2 for exponential growth */
for (i = 0; i < exponent - 1; i++)
if (job)
{
ival = DirectFunctionCall2(interval_mul, ival, Float8GetDatum(2));
for (i = 0; i < exponent - 1; i++)
{
ival = DirectFunctionCall2(interval_mul, ival, Float8GetDatum(2));
}
}

/* ival_max is the ceiling = MAX_INTERVALS_BACKOFF * schedule_interval */
Datum ival_max;
if (job)
Expand Down

0 comments on commit 07b074f

Please sign in to comment.