Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix a bug where servers could be marked as up when they were failing #16506

Merged
merged 5 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/16506.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug introduced in Synapse 1.59.0 where servers would be incorrectly marked as available when a request resulted in an error.
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if there's a more visible symptom? Perhaps it would cause things to be retried too often?

30 changes: 17 additions & 13 deletions synapse/util/retryutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,10 @@ def __init__(
database in milliseconds, or zero if the last request was
successful.
backoff_on_404: Back off if we get a 404

backoff_on_failure: set to False if we should not increase the
retry interval on a failure.

notifier: A notifier used to mark servers as up.
replication_client A replication client used to mark servers as up.
backoff_on_all_error_codes: Whether we should back off on any
error code.
"""
Expand Down Expand Up @@ -237,6 +237,9 @@ def __exit__(
else:
valid_err_code = False

# Whether previous requests to the destination had been failing.
previously_failing = bool(self.failure_ts)

if success:
# We connected successfully.
if not self.retry_interval:
Expand Down Expand Up @@ -282,6 +285,9 @@ def __exit__(
if self.failure_ts is None:
self.failure_ts = retry_last_ts

# Whether the current request to the destination had been failing.
currently_failing = bool(self.failure_ts)

async def store_retry_timings() -> None:
try:
await self.store.set_destination_retry_timings(
Expand All @@ -291,17 +297,15 @@ async def store_retry_timings() -> None:
self.retry_interval,
)

if self.notifier:
# Inform the relevant places that the remote server is back up.
self.notifier.notify_remote_server_up(self.destination)

if self.replication_client:
# If we're on a worker we try and inform master about this. The
# replication client doesn't hook into the notifier to avoid
# infinite loops where we send a `REMOTE_SERVER_UP` command to
# master, which then echoes it back to us which in turn pokes
# the notifier.
self.replication_client.send_remote_server_up(self.destination)
# If the server was previously failing, but is no longer.
if previously_failing and not currently_failing:
if self.notifier:
# Inform the relevant places that the remote server is back up.
self.notifier.notify_remote_server_up(self.destination)

if self.replication_client:
# Inform other workers that the remote server is up.
self.replication_client.send_remote_server_up(self.destination)

except Exception:
logger.exception("Failed to store destination_retry_timings")
Expand Down
75 changes: 75 additions & 0 deletions tests/util/test_retryutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest import mock

from synapse.notifier import Notifier
from synapse.replication.tcp.handler import ReplicationCommandHandler
from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter

from tests.unittest import HomeserverTestCase
Expand Down Expand Up @@ -109,6 +113,77 @@ def test_limiter(self) -> None:
new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
self.assertIsNone(new_timings)

def test_notifier_replication(self) -> None:
"""Ensure the notifier/replication client is called only when expected."""
store = self.hs.get_datastores().main

notifier = mock.Mock(spec=Notifier)
replication_client = mock.Mock(spec=ReplicationCommandHandler)

limiter = self.get_success(
get_retry_limiter(
"test_dest",
self.clock,
store,
notifier=notifier,
replication_client=replication_client,
)
)

# The server is already up, nothing should occur.
self.pump(1)
with limiter:
pass
self.pump()

new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
self.assertIsNone(new_timings)
notifier.notify_remote_server_up.assert_not_called()
replication_client.send_remote_server_up.assert_not_called()

# Attempt again, but return an error. This will cause new retry timings, but
# should not trigger server up notifications.
self.pump(1)
try:
with limiter:
raise AssertionError("argh")
except AssertionError:
pass
self.pump()

new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
# The exact retry timings are tested separately.
self.assertIsNotNone(new_timings)
notifier.notify_remote_server_up.assert_not_called()
replication_client.send_remote_server_up.assert_not_called()

# A second failing request should be treated as the above.
self.pump(1)
try:
with limiter:
raise AssertionError("argh")
except AssertionError:
pass
self.pump()

new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
# The exact retry timings are tested separately.
self.assertIsNotNone(new_timings)
notifier.notify_remote_server_up.assert_not_called()
replication_client.send_remote_server_up.assert_not_called()

# A final successful attempt should generate a server up notification.
self.pump(1)
with limiter:
pass
self.pump()

new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
# The exact retry timings are tested separately.
self.assertIsNone(new_timings)
notifier.notify_remote_server_up.assert_called_once_with("test_dest")
replication_client.send_remote_server_up.assert_called_once_with("test_dest")

def test_max_retry_interval(self) -> None:
"""Test that `destination_max_retry_interval` setting works as expected"""
store = self.hs.get_datastores().main
Expand Down
Loading