From 127e50500c8a05d34a96ad75f0a4d00be007020a Mon Sep 17 00:00:00 2001 From: Andrey Date: Thu, 26 Sep 2024 11:25:17 +0200 Subject: [PATCH] retry HTTP request on 429 and 5xx responses --- lib/datadog/ci/ext/transport.rb | 1 + lib/datadog/ci/transport/http.rb | 59 +++++++++++--- sig/datadog/ci/ext/transport.rbs | 2 + sig/datadog/ci/transport/http.rbs | 1 + spec/datadog/ci/transport/http_spec.rb | 107 +++++++++++++++++++++---- 5 files changed, 143 insertions(+), 27 deletions(-) diff --git a/lib/datadog/ci/ext/transport.rb b/lib/datadog/ci/ext/transport.rb index 2e64e62f..b0956acc 100644 --- a/lib/datadog/ci/ext/transport.rb +++ b/lib/datadog/ci/ext/transport.rb @@ -12,6 +12,7 @@ module Transport HEADER_CONTENT_ENCODING = "Content-Encoding" HEADER_EVP_SUBDOMAIN = "X-Datadog-EVP-Subdomain" HEADER_CONTAINER_ID = "Datadog-Container-ID" + HEADER_RATELIMIT_RESET = "X-RateLimit-Reset" EVP_PROXY_V2_PATH_PREFIX = "/evp_proxy/v2/" EVP_PROXY_V4_PATH_PREFIX = "/evp_proxy/v4/" diff --git a/lib/datadog/ci/transport/http.rb b/lib/datadog/ci/transport/http.rb index 00f0c118..3752872e 100644 --- a/lib/datadog/ci/transport/http.rb +++ b/lib/datadog/ci/transport/http.rb @@ -23,6 +23,7 @@ class HTTP DEFAULT_TIMEOUT = 30 MAX_RETRIES = 3 INITIAL_BACKOFF = 1 + MAX_BACKOFF = 30 def initialize(host:, port:, timeout: DEFAULT_TIMEOUT, ssl: true, compress: false) @host = host @@ -78,21 +79,53 @@ def request( private def perform_http_call(path:, payload:, headers:, verb:, retries: MAX_RETRIES, backoff: INITIAL_BACKOFF) - adapter.call( - path: path, payload: payload, headers: headers, verb: verb - ) - rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, SocketError, Net::HTTPBadResponse => e - Datadog.logger.debug("Failed to send request with #{e} (#{e.message})") - - if retries.positive? - sleep(backoff) + # retry logic as lambda to avoid duplication + retry_request = ->(previous_response, current_backoff) do + if retries.positive? && backoff <= MAX_BACKOFF + sleep(backoff) + + perform_http_call( + path: path, + payload: payload, + headers: headers, + verb: verb, + retries: retries - 1, + backoff: current_backoff * 2 + ) + else + Datadog.logger.error( + "Failed to send request after #{MAX_RETRIES - retries} retries (current backoff value #{backoff})" + ) + + previous_response + end + end - perform_http_call( - path: path, payload: payload, headers: headers, verb: verb, retries: retries - 1, backoff: backoff * 2 + begin + response = adapter.call( + path: path, payload: payload, headers: headers, verb: verb ) - else - Datadog.logger.error("Failed to send request after #{MAX_RETRIES} retries") - ErrorResponse.new(e) + return response if response.ok? + + if response.code == 429 + backoff = (response.header(Ext::Transport::HEADER_RATELIMIT_RESET) || 1).to_i + + Datadog.logger.debug do + "Received rate limit response, retrying in #{backoff} seconds from X-RateLimit-Reset header" + end + + retry_request.call(response, backoff) + elsif response.server_error? + Datadog.logger.debug { "Received server error response, retrying in #{backoff} seconds" } + + retry_request.call(response, backoff) + else + response + end + rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError, SocketError, Net::HTTPBadResponse => e + Datadog.logger.debug { "Failed to send request with #{e} (#{e.message})" } + + retry_request.call(ErrorResponse.new(e), backoff) end end diff --git a/sig/datadog/ci/ext/transport.rbs b/sig/datadog/ci/ext/transport.rbs index b0ca5de2..7ccca67b 100644 --- a/sig/datadog/ci/ext/transport.rbs +++ b/sig/datadog/ci/ext/transport.rbs @@ -16,6 +16,8 @@ module Datadog HEADER_CONTAINER_ID: "Datadog-Container-ID" + HEADER_RATELIMIT_RESET: "X-RateLimit-Reset" + EVP_PROXY_V2_PATH_PREFIX: "/evp_proxy/v2/" EVP_PROXY_V4_PATH_PREFIX: "/evp_proxy/v4/" diff --git a/sig/datadog/ci/transport/http.rbs b/sig/datadog/ci/transport/http.rbs index 4548ac33..c709a6ef 100644 --- a/sig/datadog/ci/transport/http.rbs +++ b/sig/datadog/ci/transport/http.rbs @@ -13,6 +13,7 @@ module Datadog DEFAULT_TIMEOUT: 30 MAX_RETRIES: 3 INITIAL_BACKOFF: 1 + MAX_BACKOFF: 30 def initialize: (host: String, port: Integer, ?ssl: bool, ?timeout: Integer, ?compress: bool) -> void diff --git a/spec/datadog/ci/transport/http_spec.rb b/spec/datadog/ci/transport/http_spec.rb index 72055858..be6c5c7a 100644 --- a/spec/datadog/ci/transport/http_spec.rb +++ b/spec/datadog/ci/transport/http_spec.rb @@ -164,7 +164,7 @@ context "when request fails" do let(:request_options) { {backoff: 0} } - context "when server returns error status code" do + context "when server returns 400" do let(:net_http_response) { double("Net::HTTP::Response", code: 400, body: "error", "[]": nil) } before do @@ -178,27 +178,106 @@ end end - context "when succeeds after retries" do - before do - expect(adapter).to receive(:call).and_raise(Errno::ECONNRESET).exactly(described_class::MAX_RETRIES).times - expect(adapter).to receive(:call).and_return(http_response) + context "when server returns 429" do + let(:no_backoff) { "0" } + let(:response_429_no_backoff) do + Datadog::CI::Transport::Adapters::Net::Response.new( + double("Net::HTTP::Response", code: 429, body: "error", "[]": no_backoff) + ) end - it "produces a response" do - is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response) + context "when backoff increases" do + let(:high_backoff) { "31" } + let(:response_429_high_backoff) do + Datadog::CI::Transport::Adapters::Net::Response.new( + double("Net::HTTP::Response", code: 429, body: "error", "[]": high_backoff) + ) + end + + before do + expect(adapter).to receive(:call).and_return(response_429_no_backoff).once + expect(adapter).to receive(:call).and_return(response_429_high_backoff).once + end + + it "retries until backoff is over 30, afterwards returns failed response" do + is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response) + + expect(response.code).to eq(429) + end + end + + context "when the call eventually succeeds" do + before do + expect(adapter).to( + receive(:call).and_return(response_429_no_backoff).exactly(described_class::MAX_RETRIES).times + ) + expect(adapter).to receive(:call).and_return(http_response).once + end + + it "produces a response" do + is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response) - expect(response.code).to eq(200) + expect(response.code).to eq(200) + end end end - context "when retries are exhausted" do - before do - expect(adapter).to receive(:call).and_raise(Errno::ECONNRESET).exactly(described_class::MAX_RETRIES + 1).times + context "when server returns 503" do + let(:response_503) do + Datadog::CI::Transport::Adapters::Net::Response.new( + double("Net::HTTP::Response", code: 503, body: "error", "[]": nil) + ) + end + + context "when succeeds after retries" do + before do + expect(adapter).to receive(:call).and_return(response_503).exactly(described_class::MAX_RETRIES).times + expect(adapter).to receive(:call).and_return(http_response) + end + + it "produces a response" do + is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response) + + expect(response.code).to eq(200) + end + end + + context "when retries are exhausted" do + before do + expect(adapter).to receive(:call).and_return(response_503).exactly(described_class::MAX_RETRIES + 1).times + end + + it "returns failed response" do + is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response) + + expect(response.code).to eq(503) + end end + end + + context "when network connection fails" do + context "when succeeds after retries" do + before do + expect(adapter).to receive(:call).and_raise(Errno::ECONNRESET).exactly(described_class::MAX_RETRIES).times + expect(adapter).to receive(:call).and_return(http_response) + end + + it "produces a response" do + is_expected.to be_a_kind_of(Datadog::CI::Transport::Adapters::Net::Response) + + expect(response.code).to eq(200) + end + end + + context "when retries are exhausted" do + before do + expect(adapter).to receive(:call).and_raise(Errno::ECONNRESET).exactly(described_class::MAX_RETRIES + 1).times + end - it "returns ErrorRsponse" do - expect(response.error).to be_kind_of(Errno::ECONNRESET) - expect(response.telemetry_error_type).to eq(Datadog::CI::Ext::Telemetry::ErrorType::NETWORK) + it "returns ErrorRsponse" do + expect(response.error).to be_kind_of(Errno::ECONNRESET) + expect(response.telemetry_error_type).to eq(Datadog::CI::Ext::Telemetry::ErrorType::NETWORK) + end end end end