Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Instrument FederationStateIdsServlet - /state_ids #13499

Merged
merged 4 commits into from
Aug 15, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/13499.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Instrument `FederationStateIdsServlet` (`/state_ids`) for understandable traces in Jaeger.
11 changes: 10 additions & 1 deletion synapse/federation/federation_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,12 @@
nested_logging_context,
run_in_background,
)
from synapse.logging.opentracing import log_kv, start_active_span_from_edu, trace
from synapse.logging.opentracing import (
log_kv,
start_active_span_from_edu,
tag_args,
trace,
)
from synapse.metrics.background_process_metrics import wrap_as_background_process
from synapse.replication.http.federation import (
ReplicationFederationSendEduRestServlet,
Expand Down Expand Up @@ -547,6 +552,8 @@ async def on_room_state_request(

return 200, resp

@trace
@tag_args
async def on_state_ids_request(
self, origin: str, room_id: str, event_id: str
) -> Tuple[int, JsonDict]:
Expand All @@ -569,6 +576,8 @@ async def on_state_ids_request(

return 200, resp

@trace
@tag_args
async def _on_state_ids_request_compute(
self, room_id: str, event_id: str
) -> JsonDict:
Expand Down
4 changes: 3 additions & 1 deletion synapse/handlers/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
from synapse.federation.federation_client import InvalidResponseError
from synapse.http.servlet import assert_params_in_dict
from synapse.logging.context import nested_logging_context
from synapse.logging.opentracing import trace
from synapse.logging.opentracing import tag_args, trace
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.module_api import NOT_SPAM
from synapse.replication.http.federation import (
Expand Down Expand Up @@ -1081,6 +1081,8 @@ async def on_make_knock_request(

return event

@trace
@tag_args
async def get_state_ids_for_pdu(self, room_id: str, event_id: str) -> List[str]:
"""Returns the state at the event. i.e. not including said event."""
event = await self.store.get_event(event_id, check_room_id=room_id)
Expand Down
3 changes: 3 additions & 0 deletions synapse/storage/databases/main/event_federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
from synapse.api.errors import StoreError
from synapse.api.room_versions import EventFormatVersions, RoomVersion
from synapse.events import EventBase, make_event_from_dict
from synapse.logging.opentracing import tag_args, trace
from synapse.metrics.background_process_metrics import wrap_as_background_process
from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
from synapse.storage.database import (
Expand Down Expand Up @@ -126,6 +127,8 @@ async def get_auth_chain(
)
return await self.get_events_as_list(event_ids)

@trace
@tag_args
async def get_auth_chain_ids(
self,
room_id: str,
Expand Down
5 changes: 5 additions & 0 deletions synapse/util/ratelimitutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
make_deferred_yieldable,
run_in_background,
)
from synapse.logging.opentracing import start_active_span
from synapse.util import Clock

if typing.TYPE_CHECKING:
Expand Down Expand Up @@ -110,6 +111,9 @@ def ratelimit(self) -> "Iterator[defer.Deferred[None]]":
def _on_enter(self, request_id: object) -> "defer.Deferred[None]":
time_now = self.clock.time_msec()

wait_span_scope = start_active_span("ratelimit wait")
wait_span_scope.__enter__()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can make a big difference in understanding where all of the time went. In the before, there is just a giant gap with no indication of what's taking up the time. In the after, you can see the ratelimit wait span 🎉

Before After


# remove any entries from request_times which aren't within the window
self.request_times[:] = [
r for r in self.request_times if time_now - r < self.window_size
Expand Down Expand Up @@ -162,6 +166,7 @@ def on_wait_finished(_: Any) -> "defer.Deferred[None]":

def on_start(r: object) -> object:
logger.debug("Ratelimit [%s]: Processing req", id(request_id))
wait_span_scope.__exit__(None, None, None)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit concerned that this approach could lead to us forgetting to exit the scope in some situations (particularly: on cancellation, or other exceptions such as the place where we raise LimitExceededError).

one approach we've taken elsewhere is to make the span cover the whole of the limited operation (ie, the wait, and the operation itself), and just emit a tracing event when the limiter completes (or have a nested span). That means you can just do a with start_active_span(...) as normal.

Failing that, I'd suggest doing this in on_both instead of on_start, and moving the call to start_active_span to just before the definition of on_both, to minimise the amount of code that can throw exceptions between the two.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

one approach we've taken elsewhere is to make the span cover the whole of the limited operation (ie, the wait, and the operation itself), and just emit a tracing event when the limiter completes (or have a nested span). That means you can just do a with start_active_span(...) as normal.

We already have a span that covers the wait and the operation itself. I'm interested in exposing how long the wait is though.


Failing that, I'd suggest doing this in on_both instead of on_start, and moving the call to start_active_span to just before the definition of on_both, to minimise the amount of code that can throw exceptions between the two.

Moved to this 👍

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm interested in exposing how long the wait is though.

doesn't emitting a tracing event do that though?

Moved to this

ok then :)

self.current_processing.add(request_id)
return r

Expand Down