Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Stop getting missing prev_events after we already know their signature is invalid #13816

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
924ae2b
Track when the pulled event signature fails
MadLittleMods Sep 14, 2022
d240aeb
Add changelog
MadLittleMods Sep 14, 2022
cfb4e88
Fix reference
MadLittleMods Sep 14, 2022
9b390a3
Stop getting missing prev_events after we already know their signatur…
MadLittleMods Sep 15, 2022
3d8507d
Merge branch 'develop' into madlittlemods/13700-track-when-event-sign…
MadLittleMods Sep 23, 2022
88a75cf
Use callback pattern to record signature failures
MadLittleMods Sep 23, 2022
d29ac0b
Add docstring
MadLittleMods Sep 24, 2022
14e39ee
Record failure from get_pdu and add test
MadLittleMods Sep 24, 2022
7898371
Be more selective about which errors to care about
MadLittleMods Sep 30, 2022
43f1d1a
Merge branch 'develop' into madlittlemods/13700-track-when-event-sign…
MadLittleMods Sep 30, 2022
e32b901
Merge branch 'madlittlemods/13700-track-when-event-signature-fails' i…
MadLittleMods Sep 30, 2022
83feb1b
Merge branch 'develop' into madlittlemods/13700-track-when-event-sign…
MadLittleMods Oct 1, 2022
7d102e8
Merge branch 'develop' into madlittlemods/13700-track-when-event-sign…
MadLittleMods Oct 1, 2022
81410b6
Merge branch 'madlittlemods/13700-track-when-event-signature-fails' i…
MadLittleMods Oct 1, 2022
958fd3b
Merge branch 'develop' into madlittlemods/13622-13700-stop-getting-st…
MadLittleMods Oct 3, 2022
e24db41
Weird name and add tests
MadLittleMods Oct 4, 2022
f853e78
Add changelog
MadLittleMods Oct 4, 2022
43fb6b8
Bail early and fix lints
MadLittleMods Oct 4, 2022
03f23b7
Add integration test with backfill
MadLittleMods Oct 4, 2022
99d3e79
Fix lints and better message
MadLittleMods Oct 4, 2022
f11f5b5
Fix test description
MadLittleMods Oct 4, 2022
6878faa
Fix test descriptions
MadLittleMods Oct 4, 2022
f3b443d
Scratch debug changes
MadLittleMods Oct 5, 2022
d135d41
Stop cascading backoff errors
MadLittleMods Oct 5, 2022
4effca9
Remove scratch changes
MadLittleMods Oct 5, 2022
e3cc054
Use custom FederationPullAttemptBackoffError
MadLittleMods Oct 5, 2022
74f9e03
Fix test to reflect no more cascade
MadLittleMods Oct 5, 2022
0b900e1
Better comments from what I've gathered
MadLittleMods Oct 5, 2022
3cb2826
Clarify which error in comments
MadLittleMods Oct 5, 2022
5f313df
Add some clarification to the test comment
MadLittleMods Oct 5, 2022
02e6bdd
Merge branch 'develop' into madlittlemods/13622-13700-stop-getting-st…
MadLittleMods Oct 12, 2022
7f4fdd2
Not a SynapseError
MadLittleMods Oct 12, 2022
89ffbcb
Make sure usages of _compute_event_context_with_maybe_missing_prevs h…
MadLittleMods Oct 12, 2022
354f682
Remove double "recently"
MadLittleMods Oct 12, 2022
e0b0447
Do the calculation in Python because it's more clear when we need res…
MadLittleMods Oct 12, 2022
e72c4e5
Use built-in select many function
MadLittleMods Oct 12, 2022
4e08039
No need for txn
MadLittleMods Oct 12, 2022
b060652
Rename function to reflect functionality
MadLittleMods Oct 12, 2022
b1a0c1b
Fix test description to make it accurate
MadLittleMods Oct 12, 2022
bccd802
Use more standard string interpolation with `logger`
MadLittleMods Oct 15, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/13816.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Stop fetching missing `prev_events` after we already know their signature is invalid.
21 changes: 21 additions & 0 deletions synapse/api/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,27 @@ def get_dict(self) -> "JsonDict":
}


class FederationPullAttemptBackoffError(RuntimeError):
"""
Raised to indicate that we are are deliberately not attempting to pull the given
event over federation because we've already done so recently and are backing off.

Attributes:
event_id: The event_id which we are refusing to pull
message: A custom error message that gives more context
"""

def __init__(self, event_ids: List[str], message: Optional[str]):
self.event_ids = event_ids

if message:
error_message = message
else:
error_message = f"Not attempting to pull event_ids={self.event_ids} because we already tried to pull them recently (backing off)."

super().__init__(error_message)


class HttpResponseException(CodeMessageException):
"""
Represents an HTTP-level failure of an outbound request
Expand Down
16 changes: 16 additions & 0 deletions synapse/handlers/federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
Codes,
FederationDeniedError,
FederationError,
FederationPullAttemptBackoffError,
HttpResponseException,
LimitExceededError,
NotFoundError,
Expand Down Expand Up @@ -1720,7 +1721,22 @@ async def _sync_partial_state_room(
destination, event
)
break
except FederationPullAttemptBackoffError as exc:
# Log a warning about why we failed to process the event (the error message
# for `FederationPullAttemptBackoffError` is pretty good)
logger.warning("_sync_partial_state_room: %s", exc)
# We do not record a failed pull attempt when we backoff fetching a missing
# `prev_event` because not being able to fetch the `prev_events` just means
# we won't be able to de-outlier the pulled event. But we can still use an
# `outlier` in the state/auth chain for another event. So we shouldn't stop
# a downstream event from trying to pull it.
#
# This avoids a cascade of backoff for all events in the DAG downstream from
# one event backoff upstream.
except FederationError as e:
# TODO: We should `record_event_failed_pull_attempt` here,
# see https://github.com/matrix-org/synapse/issues/13700

if attempt == len(destinations) - 1:
# We have tried every remote server for this event. Give up.
# TODO(faster_joins) giving up isn't the right thing to do
Expand Down
31 changes: 31 additions & 0 deletions synapse/handlers/federation_event.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
AuthError,
Codes,
FederationError,
FederationPullAttemptBackoffError,
HttpResponseException,
RequestSendFailed,
SynapseError,
Expand Down Expand Up @@ -565,6 +566,9 @@ async def update_state_for_partial_state_event(
event: partial-state event to be de-partial-stated

Raises:
FederationPullAttemptBackoffError if we are are deliberately not attempting
to pull the given event over federation because we've already done so
recently and are backing off.
FederationError if we fail to request state from the remote server.
"""
logger.info("Updating state for %s", event.event_id)
Expand Down Expand Up @@ -899,6 +903,18 @@ async def _process_pulled_event(
context,
backfilled=backfilled,
)
except FederationPullAttemptBackoffError as exc:
# Log a warning about why we failed to process the event (the error message
# for `FederationPullAttemptBackoffError` is pretty good)
logger.warning("_process_pulled_event: %s", exc)
# We do not record a failed pull attempt when we backoff fetching a missing
# `prev_event` because not being able to fetch the `prev_events` just means
# we won't be able to de-outlier the pulled event. But we can still use an
# `outlier` in the state/auth chain for another event. So we shouldn't stop
# a downstream event from trying to pull it.
#
# This avoids a cascade of backoff for all events in the DAG downstream from
# one event backoff upstream.
except FederationError as e:
await self._store.record_event_failed_pull_attempt(
event.room_id, event_id, str(e)
Expand Down Expand Up @@ -945,6 +961,9 @@ async def _compute_event_context_with_maybe_missing_prevs(
The event context.

Raises:
FederationPullAttemptBackoffError if we are are deliberately not attempting
to pull the given event over federation because we've already done so
recently and are backing off.
FederationError if we fail to get the state from the remote server after any
missing `prev_event`s.
"""
Expand All @@ -955,6 +974,18 @@ async def _compute_event_context_with_maybe_missing_prevs(
seen = await self._store.have_events_in_timeline(prevs)
missing_prevs = prevs - seen

# If we've already recently attempted to pull this missing event, don't
# try it again so soon. Since we have to fetch all of the prev_events, we can
# bail early here if we find any to ignore.
prevs_to_ignore = await self._store.get_event_ids_to_not_pull_from_backoff(
room_id, missing_prevs
)
erikjohnston marked this conversation as resolved.
Show resolved Hide resolved
if len(prevs_to_ignore) > 0:
raise FederationPullAttemptBackoffError(
MadLittleMods marked this conversation as resolved.
Show resolved Hide resolved
event_ids=prevs_to_ignore,
message=f"While computing context for event={event_id}, not attempting to pull missing prev_event={prevs_to_ignore[0]} because we already tried to pull recently (backing off).",
)
MadLittleMods marked this conversation as resolved.
Show resolved Hide resolved

if not missing_prevs:
return await self._state_handler.compute_event_context(event)

Expand Down
54 changes: 54 additions & 0 deletions synapse/storage/databases/main/event_federation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1501,6 +1501,12 @@ async def record_event_failed_pull_attempt(
event_id: The event that failed to be fetched or processed
cause: The error message or reason that we failed to pull the event
"""
logger.debug(
"record_event_failed_pull_attempt room_id=%s, event_id=%s, cause=%s",
room_id,
event_id,
cause,
)
await self.db_pool.runInteraction(
"record_event_failed_pull_attempt",
self._record_event_failed_pull_attempt_upsert_txn,
Expand Down Expand Up @@ -1530,6 +1536,54 @@ def _record_event_failed_pull_attempt_upsert_txn(

txn.execute(sql, (room_id, event_id, 1, self._clock.time_msec(), cause))

@trace
async def get_event_ids_to_not_pull_from_backoff(
self,
room_id: str,
event_ids: Collection[str],
) -> List[str]:
"""
Filter down the events to ones that we've failed to pull before recently. Uses
exponential backoff.

Args:
room_id: The room that the events belong to
event_ids: A list of events to filter down

Returns:
List of event_ids that should not be attempted to be pulled
"""
event_failed_pull_attempts = await self.db_pool.simple_select_many_batch(
table="event_failed_pull_attempts",
column="event_id",
iterable=event_ids,
keyvalues={},
retcols=(
"event_id",
"last_attempt_ts",
"num_attempts",
),
desc="get_event_ids_to_not_pull_from_backoff",
)

current_time = self._clock.time_msec()
return [
event_failed_pull_attempt["event_id"]
for event_failed_pull_attempt in event_failed_pull_attempts
# Exponential back-off (up to the upper bound) so we don't try to
# pull the same event over and over. ex. 2hr, 4hr, 8hr, 16hr, etc.
if current_time
< event_failed_pull_attempt["last_attempt_ts"]
+ (
2
** min(
event_failed_pull_attempt["num_attempts"],
BACKFILL_EVENT_EXPONENTIAL_BACKOFF_MAXIMUM_DOUBLING_STEPS,
)
)
* BACKFILL_EVENT_EXPONENTIAL_BACKOFF_STEP_MILLISECONDS
Comment on lines +1573 to +1584
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

^ Sanity check appreciated here to make sure we're doing the equivalent of

/**
* Exponential back-off (up to the upper bound) so we don't retry the
* same backfill point over and over. ex. 2hr, 4hr, 8hr, 16hr, etc.
*
* We use `1 << n` as a power of 2 equivalent for compatibility
* with older SQLites. The left shift equivalent only works with
* powers of 2 because left shift is a binary operation (base-2).
* Otherwise, we would use `power(2, n)` or the power operator, `2^n`.
*/
AND (
failed_backfill_attempt_info.event_id IS NULL
OR ? /* current_time */ >= failed_backfill_attempt_info.last_attempt_ts + (
(1 << {least_function}(failed_backfill_attempt_info.num_attempts, ? /* max doubling steps */))
* ? /* step */
)
)

]

async def get_missing_events(
self,
room_id: str,
Expand Down
Loading