Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Don't reset retry timers on "valid" error codes #16221

Merged
merged 5 commits into from
Sep 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/16221.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix long-standing bug where we did not correctly back off from servers that had "gone" if they returned 4xx series error codes.
4 changes: 3 additions & 1 deletion synapse/federation/transport/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,10 @@ async def send_transaction(
data=json_data,
json_data_callback=json_data_callback,
long_retries=True,
backoff_on_404=True, # If we get a 404 the other side has gone
try_trailing_slash_on_400=True,
# Sending a transaction should always succeed, if it doesn't
# then something is wrong and we should backoff.
backoff_on_all_error_codes=True,
)

async def make_query(
Expand Down
8 changes: 8 additions & 0 deletions synapse/http/matrixfederationclient.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,7 @@ async def _send_request(
long_retries: bool = False,
ignore_backoff: bool = False,
backoff_on_404: bool = False,
backoff_on_all_error_codes: bool = False,
) -> IResponse:
"""
Sends a request to the given server.
Expand Down Expand Up @@ -552,6 +553,7 @@ async def _send_request(
and try the request anyway.

backoff_on_404: Back off if we get a 404
backoff_on_all_error_codes: Back off if we get any error response

Returns:
Resolves with the HTTP response object on success.
Expand Down Expand Up @@ -594,6 +596,7 @@ async def _send_request(
ignore_backoff=ignore_backoff,
notifier=self.hs.get_notifier(),
replication_client=self.hs.get_replication_command_handler(),
backoff_on_all_error_codes=backoff_on_all_error_codes,
)

method_bytes = request.method.encode("ascii")
Expand Down Expand Up @@ -889,6 +892,7 @@ async def put_json(
backoff_on_404: bool = False,
try_trailing_slash_on_400: bool = False,
parser: Literal[None] = None,
backoff_on_all_error_codes: bool = False,
) -> JsonDict:
...

Expand All @@ -906,6 +910,7 @@ async def put_json(
backoff_on_404: bool = False,
try_trailing_slash_on_400: bool = False,
parser: Optional[ByteParser[T]] = None,
backoff_on_all_error_codes: bool = False,
) -> T:
...

Expand All @@ -922,6 +927,7 @@ async def put_json(
backoff_on_404: bool = False,
try_trailing_slash_on_400: bool = False,
parser: Optional[ByteParser[T]] = None,
backoff_on_all_error_codes: bool = False,
) -> Union[JsonDict, T]:
"""Sends the specified json data using PUT

Expand Down Expand Up @@ -957,6 +963,7 @@ async def put_json(
enabled.
parser: The parser to use to decode the response. Defaults to
parsing as JSON.
backoff_on_all_error_codes: Back off if we get any error response

Returns:
Succeeds when we get a 2xx HTTP response. The
Expand Down Expand Up @@ -990,6 +997,7 @@ async def put_json(
ignore_backoff=ignore_backoff,
long_retries=long_retries,
timeout=timeout,
backoff_on_all_error_codes=backoff_on_all_error_codes,
)

if timeout is not None:
Expand Down
18 changes: 16 additions & 2 deletions synapse/util/retryutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def __init__(
backoff_on_failure: bool = True,
notifier: Optional["Notifier"] = None,
replication_client: Optional["ReplicationCommandHandler"] = None,
backoff_on_all_error_codes: bool = False,
):
"""Marks the destination as "down" if an exception is thrown in the
context, except for CodeMessageException with code < 500.
Expand All @@ -147,6 +148,9 @@ def __init__(

backoff_on_failure: set to False if we should not increase the
retry interval on a failure.

backoff_on_all_error_codes: Whether we should back off on any
error code.
"""
self.clock = clock
self.store = store
Expand All @@ -156,6 +160,7 @@ def __init__(
self.retry_interval = retry_interval
self.backoff_on_404 = backoff_on_404
self.backoff_on_failure = backoff_on_failure
self.backoff_on_all_error_codes = backoff_on_all_error_codes

self.notifier = notifier
self.replication_client = replication_client
Expand All @@ -179,6 +184,7 @@ def __exit__(
exc_val: Optional[BaseException],
exc_tb: Optional[TracebackType],
) -> None:
success = exc_type is None
valid_err_code = False
if exc_type is None:
valid_err_code = True
Expand All @@ -195,7 +201,9 @@ def __exit__(
# won't accept our requests for at least a while.
# 429 is us being aggressively rate limited, so lets rate limit
# ourselves.
if exc_val.code == 404 and self.backoff_on_404:
if self.backoff_on_all_error_codes:
valid_err_code = False
elif exc_val.code == 404 and self.backoff_on_404:
valid_err_code = False
elif exc_val.code in (401, 429):
valid_err_code = False
Expand All @@ -204,7 +212,7 @@ def __exit__(
else:
valid_err_code = False

if valid_err_code:
if success:
# We connected successfully.
if not self.retry_interval:
return
Expand All @@ -215,6 +223,12 @@ def __exit__(
self.failure_ts = None
retry_last_ts = 0
self.retry_interval = 0
elif valid_err_code:
# We got a potentially valid error code back. We don't reset the
# timers though, as the other side might actually be down anyway
# (e.g. some deprovisioned servers will always return a 404 or 403,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aside: would be nice if decommissioned servers could send us 410 Gone so that we would never bother to retry sending to them (unless they send us traffic in the future).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aside: would be nice if decommissioned servers could send us 410 Gone so that we would never bother to retry sending to them (unless they send us traffic in the future).

Now this would be awesome best "workaround" for servers that were "pulled the plug unsafely" and have no means/desires to build matrix server again but wants to be good citizen after the fact. Simple idea, helps what it can.

# and we don't want to keep resetting the retry timers for them).
return
elif not self.backoff_on_failure:
return
else:
Expand Down
4 changes: 2 additions & 2 deletions tests/handlers/test_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,8 +251,8 @@ def test_started_typing_remote_send(self) -> None:
),
json_data_callback=ANY,
long_retries=True,
backoff_on_404=True,
try_trailing_slash_on_400=True,
backoff_on_all_error_codes=True,
)

def test_started_typing_remote_recv(self) -> None:
Expand Down Expand Up @@ -366,7 +366,7 @@ def test_stopped_typing(self) -> None:
),
json_data_callback=ANY,
long_retries=True,
backoff_on_404=True,
backoff_on_all_error_codes=True,
try_trailing_slash_on_400=True,
)

Expand Down