
Fix a bug where servers could be marked as up when they were failing #16506

Merged (5 commits, Oct 17, 2023)
Changes from 3 commits
1 change: 1 addition & 0 deletions changelog.d/16506.bugfix
@@ -0,0 +1 @@
Fix a bug introduced in Synapse 1.59.0 where servers would be incorrectly marked as available when a request resulted in an error.
Member Author commented:

I'm not sure if there's a more visible symptom? Perhaps it would cause things to be retried too often?

27 changes: 14 additions & 13 deletions synapse/util/retryutils.py
@@ -170,10 +170,10 @@ def __init__(
database in milliseconds, or zero if the last request was
successful.
backoff_on_404: Back off if we get a 404

backoff_on_failure: set to False if we should not increase the
retry interval on a failure.

notifier: A notifier used to mark servers as up.
replication_client: A replication client used to mark servers as up.
backoff_on_all_error_codes: Whether we should back off on any
error code.
"""
@@ -237,6 +237,9 @@ def __exit__(
else:
valid_err_code = False

# Store whether the destination had previously been failing.
previously_failing = bool(self.failure_ts)

if success:
# We connected successfully.
if not self.retry_interval:
@@ -291,17 +294,15 @@ async def store_retry_timings() -> None:
self.retry_interval,
)

if self.notifier:
# Inform the relevant places that the remote server is back up.
self.notifier.notify_remote_server_up(self.destination)

if self.replication_client:
# If we're on a worker we try and inform master about this. The
# replication client doesn't hook into the notifier to avoid
# infinite loops where we send a `REMOTE_SERVER_UP` command to
# master, which then echoes it back to us which in turn pokes
# the notifier.
self.replication_client.send_remote_server_up(self.destination)
# If the server was previously failing, but is no longer.
if previously_failing:
Member Author commented:

@erikjohnston this might need some thoughts from you as the original author of #12500 -- was this done on purpose and I'm missing some understanding?

Member Author commented:

Actually, I think this is not quite right still, it will end up calling this code if we were previously failing & still failing. I think?

Member Author replied:

> Actually, I think this is not quite right still, it will end up calling this code if we were previously failing & still failing. I think?

It should be OK now. 👍

Member Author commented:

The logic could be simplified to only check `not currently_failing`, which relies on the earlier return to avoid kicking off the background process. But I find it a bit clearer to check both `previously_failing` and `not currently_failing`. 🤷

if self.notifier:
# Inform the relevant places that the remote server is back up.
self.notifier.notify_remote_server_up(self.destination)

if self.replication_client:
# Inform other workers that the remote server is up.
self.replication_client.send_remote_server_up(self.destination)

except Exception:
logger.exception("Failed to store destination_retry_timings")
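The review discussion above about `previously_failing` / `not currently_failing` boils down to announcing "remote server up" only on a failing-to-healthy transition; per the changelog, the old code could announce a server as up even when the request had failed. A small self-contained sketch of that rule (the truth table being discussed, not the Synapse implementation):

```python
def should_announce_server_up(previously_failing: bool, currently_failing: bool) -> bool:
    """Announce the remote server as up only when it has just recovered."""
    return previously_failing and not currently_failing


# The four possible combinations:
assert should_announce_server_up(True, False)       # just recovered -> announce
assert not should_announce_server_up(True, True)    # still failing -> stay quiet
assert not should_announce_server_up(False, True)   # just started failing -> stay quiet
assert not should_announce_server_up(False, False)  # was never failing -> nothing to do
```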
59 changes: 59 additions & 0 deletions tests/util/test_retryutils.py
@@ -11,6 +11,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from unittest import mock

from synapse.notifier import Notifier
from synapse.replication.tcp.handler import ReplicationCommandHandler
from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter

from tests.unittest import HomeserverTestCase
@@ -109,6 +113,61 @@ def test_limiter(self) -> None:
new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
self.assertIsNone(new_timings)

def test_notifier_replication(self) -> None:
"""Ensure the notifier/replication client is called only when expected."""
store = self.hs.get_datastores().main

notifier = mock.Mock(spec=Notifier)
replication_client = mock.Mock(spec=ReplicationCommandHandler)

limiter = self.get_success(
get_retry_limiter(
"test_dest",
self.clock,
store,
notifier=notifier,
replication_client=replication_client,
)
)

# The server is already up, nothing should occur.
self.pump(1)
with limiter:
pass
self.pump()

new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
self.assertIsNone(new_timings)
notifier.notify_remote_server_up.assert_not_called()
replication_client.send_remote_server_up.assert_not_called()

# Attempt again, but return an error. This will cause new retry timings.
self.pump(1)
try:
with limiter:
raise AssertionError("argh")
except AssertionError:
pass
self.pump()

new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
# The exact retry timings are tested separately.
self.assertIsNotNone(new_timings)
notifier.notify_remote_server_up.assert_not_called()
replication_client.send_remote_server_up.assert_not_called()

# One more attempt, successfully this time.
self.pump(1)
with limiter:
pass
self.pump()

new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
# A successful request should clear the retry timings.
self.assertIsNone(new_timings)
notifier.notify_remote_server_up.assert_called_once_with("test_dest")
replication_client.send_remote_server_up.assert_called_once_with("test_dest")
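A brief aside on the `mock.Mock(spec=...)` choice in this test: giving the mock a spec means the assertions cannot silently pass against a misspelled or nonexistent method. A generic, self-contained illustration (the `FakeNotifier` below is a stand-in, not the real Synapse class):

```python
from unittest import mock


class FakeNotifier:
    """Stand-in with the single method this example cares about."""

    def notify_remote_server_up(self, server: str) -> None:
        ...


spec_mock = mock.Mock(spec=FakeNotifier)
spec_mock.notify_remote_server_up("test_dest")
spec_mock.notify_remote_server_up.assert_called_once_with("test_dest")

try:
    spec_mock.notify_remote_server_upp("test_dest")  # typo: not on the spec
except AttributeError:
    print("spec= rejects attributes the real class does not define")
```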

def test_max_retry_interval(self) -> None:
"""Test that `destination_max_retry_interval` setting works as expected"""
store = self.hs.get_datastores().main