
Fix a bug where servers could be marked as up when they were failing #16506

Merged (5 commits, Oct 17, 2023)
Changes from 3 commits
1 change: 1 addition & 0 deletions changelog.d/16506.bugfix
@@ -0,0 +1 @@
Fix a bug introduced in Synapse 1.59.0 where servers would be incorrectly marked as available when a request resulted in an error.
Member Author commented:

I'm not sure if there's a more visible symptom? Perhaps it would cause things to be retried too often?

27 changes: 14 additions & 13 deletions synapse/util/retryutils.py
@@ -170,10 +170,10 @@ def __init__(
database in milliseconds, or zero if the last request was
successful.
backoff_on_404: Back off if we get a 404

backoff_on_failure: set to False if we should not increase the
retry interval on a failure.

notifier: A notifier used to mark servers as up.
replication_client: A replication client used to mark servers as up.
backoff_on_all_error_codes: Whether we should back off on any
error code.
"""
@@ -237,6 +237,9 @@ def __exit__(
else:
valid_err_code = False

# Store whether the destination had previously been failing.
previously_failing = bool(self.failure_ts)

if success:
# We connected successfully.
if not self.retry_interval:
@@ -291,17 +294,15 @@ async def store_retry_timings() -> None:
self.retry_interval,
)

if self.notifier:
# Inform the relevant places that the remote server is back up.
self.notifier.notify_remote_server_up(self.destination)

if self.replication_client:
# If we're on a worker we try and inform master about this. The
# replication client doesn't hook into the notifier to avoid
# infinite loops where we send a `REMOTE_SERVER_UP` command to
# master, which then echoes it back to us which in turn pokes
# the notifier.
self.replication_client.send_remote_server_up(self.destination)
# If the server was previously failing, but is no longer.
if previously_failing:
Member Author commented:

@erikjohnston this might need some thoughts from you as the original author of #12500 -- was this done on purpose and I'm missing some understanding?

Member Author commented:

Actually, I think this is not quite right still, it will end up calling this code if we were previously failing & still failing. I think?

Member Author replied:

> Actually, I think this is not quite right still, it will end up calling this code if we were previously failing & still failing. I think?

It should be OK now. 👍

Member Author commented:

The logic could be simplified to only check `not currently_failing`, which relies on the earlier return to avoid kicking off the background process. But I find it a bit clearer to check both `previously_failing` and `not currently_failing`. 🤷

if self.notifier:
# Inform the relevant places that the remote server is back up.
self.notifier.notify_remote_server_up(self.destination)

if self.replication_client:
# Inform other workers that the remote server is up.
self.replication_client.send_remote_server_up(self.destination)

except Exception:
logger.exception("Failed to store destination_retry_timings")
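The review discussion above about `previously_failing` / `not currently_failing` boils down to announcing "remote server up" only on a failing-to-healthy transition; per the changelog, the old code could announce a server as up even when the request had failed. A small self-contained sketch of that rule (the truth table being discussed, not the Synapse implementation):

```python
def should_announce_server_up(previously_failing: bool, currently_failing: bool) -> bool:
    """Announce the remote server as up only when it has just recovered."""
    return previously_failing and not currently_failing


# The four possible combinations:
assert should_announce_server_up(True, False)       # just recovered -> announce
assert not should_announce_server_up(True, True)    # still failing -> stay quiet
assert not should_announce_server_up(False, True)   # just started failing -> stay quiet
assert not should_announce_server_up(False, False)  # was never failing -> nothing to do
```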
59 changes: 59 additions & 0 deletions tests/util/test_retryutils.py
@@ -11,6 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest import mock

from synapse.notifier import Notifier
from synapse.replication.tcp.handler import ReplicationCommandHandler
from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter

from tests.unittest import HomeserverTestCase
@@ -109,6 +113,61 @@ def test_limiter(self) -> None:
new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
self.assertIsNone(new_timings)

def test_notifier_replication(self) -> None:
"""Ensure the notifier/replication client is called only when expected."""
store = self.hs.get_datastores().main

notifier = mock.Mock(spec=Notifier)
replication_client = mock.Mock(spec=ReplicationCommandHandler)

limiter = self.get_success(
get_retry_limiter(
"test_dest",
self.clock,
store,
notifier=notifier,
replication_client=replication_client,
)
)

# The server is already up, nothing should occur.
self.pump(1)
with limiter:
pass
self.pump()

new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
self.assertIsNone(new_timings)
notifier.notify_remote_server_up.assert_not_called()
replication_client.send_remote_server_up.assert_not_called()

# Attempt again, but return an error. This will cause new retry timings.
self.pump(1)
try:
with limiter:
raise AssertionError("argh")
except AssertionError:
pass
self.pump()

new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
# The exact retry timings are tested separately.
self.assertIsNotNone(new_timings)
notifier.notify_remote_server_up.assert_not_called()
replication_client.send_remote_server_up.assert_not_called()

# One more attempt, successfully this time.
self.pump(1)
with limiter:
pass
self.pump()

new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
# A successful request should clear the retry timings.
self.assertIsNone(new_timings)
notifier.notify_remote_server_up.assert_called_once_with("test_dest")
replication_client.send_remote_server_up.assert_called_once_with("test_dest")
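A brief aside on the `mock.Mock(spec=...)` choice in this test: giving the mock a spec means the assertions cannot silently pass against a misspelled or nonexistent method. A generic, self-contained illustration (the `FakeNotifier` below is a stand-in, not the real Synapse class):

```python
from unittest import mock


class FakeNotifier:
    """Stand-in with the single method this example cares about."""

    def notify_remote_server_up(self, server: str) -> None:
        ...


spec_mock = mock.Mock(spec=FakeNotifier)
spec_mock.notify_remote_server_up("test_dest")
spec_mock.notify_remote_server_up.assert_called_once_with("test_dest")

try:
    spec_mock.notify_remote_server_upp("test_dest")  # typo: not on the spec
except AttributeError:
    print("spec= rejects attributes the real class does not define")
```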

def test_max_retry_interval(self) -> None:
"""Test that `destination_max_retry_interval` setting works as expected"""
store = self.hs.get_datastores().main