diff --git a/changelog.d/13541.misc b/changelog.d/13541.misc new file mode 100644 index 000000000000..b488bf74c389 --- /dev/null +++ b/changelog.d/13541.misc @@ -0,0 +1 @@ +Add metrics to track how the rate limiter is affecting requests (sleep/reject). diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 434b02b97b9d..724d39b92fb2 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -30,7 +30,7 @@ run_in_background, ) from synapse.logging.opentracing import start_active_span -from synapse.metrics import Histogram +from synapse.metrics import Histogram, LaterGauge from synapse.util import Clock if typing.TYPE_CHECKING: @@ -74,6 +74,27 @@ def new_limiter() -> "_PerHostRatelimiter": str, "_PerHostRatelimiter" ] = collections.defaultdict(new_limiter) + # We track the number of affected hosts per time-period so we can + # differentiate one really noisy homeserver from a general + # ratelimit tuning problem across the federation. + LaterGauge( + "synapse_rate_limit_sleep_affected_hosts", + "Number of hosts that had requests put to sleep", + [], + lambda: sum( + ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values() + ), + ) + LaterGauge( + "synapse_rate_limit_reject_affected_hosts", + "Number of hosts that had requests rejected", + [], + lambda: sum( + ratelimiter.should_reject() + for ratelimiter in self.ratelimiters.values() + ), + ) + def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": """Used to ratelimit an incoming request from a given host @@ -139,6 +160,21 @@ def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": finally: self._on_exit(request_id) + def should_reject(self) -> bool: + """ + Whether to reject the request if we already have too many queued up + (either sleeping or in the ready queue). + """ + queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) + return queue_size > self.reject_limit + + def should_sleep(self) -> bool: + """ + Whether to sleep the request if we already have too many requests coming + through within the window. + """ + return len(self.request_times) > self.sleep_limit + def _on_enter(self, request_id: object) -> "defer.Deferred[None]": time_now = self.clock.time_msec() @@ -149,8 +185,7 @@ def _on_enter(self, request_id: object) -> "defer.Deferred[None]": # reject the request if we already have too many queued up (either # sleeping or in the ready queue). - queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) - if queue_size > self.reject_limit: + if self.should_reject(): logger.debug("Ratelimiter(%s): rejecting request", self.host) rate_limit_reject_counter.inc() raise LimitExceededError( @@ -180,7 +215,7 @@ def queue_request() -> "defer.Deferred[None]": len(self.request_times), ) - if len(self.request_times) > self.sleep_limit: + if self.should_sleep(): logger.debug( "Ratelimiter(%s) [%s]: sleeping request for %f sec", self.host,