From 18e5f6075618360956db8398dd96ada9399bf5ab Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Tue, 30 Aug 2022 18:31:57 +0100 Subject: [PATCH 1/7] Fix bug in device list caching when remote users leave rooms When a remote user leaves the last room shared with the homeserver, we have to mark their device list as unsubscribed, otherwise we will hold on to a stale device list in our cache. Crucially, the device list will remain cached even after the remote user rejoins the room, which can lead to E2EE failures until the remote user's device list next changes. Signed-off-by: Sean Quah --- synapse/storage/controllers/persist_events.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index dad3731b9b50..501dbbc99011 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -598,9 +598,9 @@ async def _persist_event_batch( # room state_delta_for_room: Dict[str, DeltaState] = {} - # Set of remote users which were in rooms the server has left. We - # should check if we still share any rooms and if not we mark their - # device lists as stale. + # Set of remote users which were in rooms the server has left or who may + # have left rooms the server is in. We should check if we still share any + # rooms and if not we mark their device lists as stale. potentially_left_users: Set[str] = set() if not backfilled: @@ -725,6 +725,20 @@ async def _persist_event_batch( current_state = {} delta.no_longer_in_room = True + # Add all remote users that might have left rooms. + potentially_left_users.update( + user_id + for event_type, user_id in delta.to_delete + if event_type == EventTypes.Member + and not self.is_mine_id(user_id) + ) + potentially_left_users.update( + user_id + for event_type, user_id in delta.to_insert.keys() + if event_type == EventTypes.Member + and not self.is_mine_id(user_id) + ) + state_delta_for_room[room_id] = delta await self.persist_events_store._persist_events_and_state_updates( From 766b136db3042d990f75ae87898c7ef2aef8371a Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Tue, 30 Aug 2022 19:38:13 +0100 Subject: [PATCH 2/7] Remove redundant `mark_remote_user_device_list_as_unsubscribed` call Now that we are handling device list unsubscriptions in the event persistence code, it's no longer necessary to mark device lists as unsubscribed elsewhere. --- synapse/handlers/device.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index c5ac169644ac..901e2310b706 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -45,7 +45,6 @@ JsonDict, StreamKeyType, StreamToken, - UserID, get_domain_from_id, get_verify_key_from_cross_signing_key, ) @@ -324,8 +323,6 @@ def __init__(self, hs: "HomeServer"): self.device_list_updater.incoming_device_list_update, ) - hs.get_distributor().observe("user_left_room", self.user_left_room) - # Whether `_handle_new_device_update_async` is currently processing. self._handle_new_device_update_is_processing = False @@ -569,14 +566,6 @@ async def notify_user_signature_update( StreamKeyType.DEVICE_LIST, position, users=[from_user_id] ) - async def user_left_room(self, user: UserID, room_id: str) -> None: - user_id = user.to_string() - room_ids = await self.store.get_rooms_for_user(user_id) - if not room_ids: - # We no longer share rooms with this user, so we'll no longer - # receive device updates. Mark this in DB. - await self.store.mark_remote_user_device_list_as_unsubscribed(user_id) - async def store_dehydrated_device( self, user_id: str, From 17dd4419a7af661cb637a8346ab0f31a5ac1ad2f Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Wed, 7 Sep 2022 20:37:09 +0100 Subject: [PATCH 3/7] Check whether we still share a room when using cached device lists Signed-off-by: Sean Quah --- synapse/handlers/e2e_keys.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py index ec81639c7825..9d97b7688bb0 100644 --- a/synapse/handlers/e2e_keys.py +++ b/synapse/handlers/e2e_keys.py @@ -175,6 +175,33 @@ async def query_devices( user_ids_not_in_cache, remote_results, ) = await self.store.get_user_devices_from_cache(query_list) + + # Check that the homeserver still shares a room with all cached users. + # Note that this check may be slightly racy when a remote user leaves a + # room after we have fetched their cached device list. In the worst case + # we will do extra federation queries for devices that we had cached. + cached_users = set(remote_results.keys()) + valid_cached_users = ( + await self.store.get_users_server_still_shares_room_with( + remote_results.keys() + ) + ) + invalid_cached_users = cached_users - valid_cached_users + if invalid_cached_users: + # Fix up results. If we get here there may be bugs in device list + # tracking. + # TODO: is this actually useful? it doesn't fix the case where a + # remote user rejoins a room and we query their device list + # after. + user_ids_not_in_cache.update(invalid_cached_users) + for invalid_user_id in invalid_cached_users: + remote_results.pop(invalid_user_id) + logger.error( + "Devices for %s were cached, but the server no longer shares " + "any rooms with them. The cached device lists are stale.", + invalid_cached_users, + ) + for user_id, devices in remote_results.items(): user_devices = results.setdefault(user_id, {}) for device_id, device in devices.items(): From 10595c87a3048356210ea814f6a183eb1223e691 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Thu, 8 Sep 2022 15:53:36 +0100 Subject: [PATCH 4/7] Fix insufficient mocking in test --- tests/handlers/test_e2e_keys.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/handlers/test_e2e_keys.py b/tests/handlers/test_e2e_keys.py index 1e6ad4b663e9..95698bc27585 100644 --- a/tests/handlers/test_e2e_keys.py +++ b/tests/handlers/test_e2e_keys.py @@ -891,6 +891,12 @@ def test_query_all_devices_caches_result(self, device_ids: Iterable[str]) -> Non new_callable=mock.MagicMock, return_value=make_awaitable(["some_room_id"]), ) + mock_get_users = mock.patch.object( + self.store, + "get_users_server_still_shares_room_with", + new_callable=mock.MagicMock, + return_value=make_awaitable({remote_user_id}), + ) mock_request = mock.patch.object( self.hs.get_federation_client(), "query_user_devices", @@ -898,7 +904,7 @@ def test_query_all_devices_caches_result(self, device_ids: Iterable[str]) -> Non return_value=make_awaitable(response_body), ) - with mock_get_rooms, mock_request as mocked_federation_request: + with mock_get_rooms, mock_get_users, mock_request as mocked_federation_request: # Make the first query and sanity check it succeeds. response_1 = self.get_success( e2e_handler.query_devices( From 5a12cb7346b78492b283a13d121aae1c311aca93 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Thu, 8 Sep 2022 14:04:01 +0100 Subject: [PATCH 5/7] Add newsfile --- changelog.d/13749.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/13749.bugfix diff --git a/changelog.d/13749.bugfix b/changelog.d/13749.bugfix new file mode 100644 index 000000000000..8ffafec07b33 --- /dev/null +++ b/changelog.d/13749.bugfix @@ -0,0 +1 @@ +Fix a long standing bug where device lists would remain cached when remote users left and rejoined the last room shared with the local homeserver. From d9287047dc596f92629554522123844006758939 Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Fri, 9 Sep 2022 14:26:33 +0100 Subject: [PATCH 6/7] Update synapse/handlers/e2e_keys.py --- synapse/handlers/e2e_keys.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py index 9d97b7688bb0..b26067200c50 100644 --- a/synapse/handlers/e2e_keys.py +++ b/synapse/handlers/e2e_keys.py @@ -190,9 +190,6 @@ async def query_devices( if invalid_cached_users: # Fix up results. If we get here there may be bugs in device list # tracking. - # TODO: is this actually useful? it doesn't fix the case where a - # remote user rejoins a room and we query their device list - # after. user_ids_not_in_cache.update(invalid_cached_users) for invalid_user_id in invalid_cached_users: remote_results.pop(invalid_user_id) From f104cc1c8cf0eb7969c40e1546706f472c4865d4 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Tue, 13 Sep 2022 16:18:43 +0100 Subject: [PATCH 7/7] fixup: add comment --- synapse/handlers/e2e_keys.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py index b26067200c50..8eed63ccf3ac 100644 --- a/synapse/handlers/e2e_keys.py +++ b/synapse/handlers/e2e_keys.py @@ -188,11 +188,13 @@ async def query_devices( ) invalid_cached_users = cached_users - valid_cached_users if invalid_cached_users: - # Fix up results. If we get here there may be bugs in device list - # tracking. + # Fix up results. If we get here, there is either a bug in device + # list tracking, or we hit the race mentioned above. user_ids_not_in_cache.update(invalid_cached_users) for invalid_user_id in invalid_cached_users: remote_results.pop(invalid_user_id) + # This log message may be removed if it turns out it's almost + # entirely triggered by races. logger.error( "Devices for %s were cached, but the server no longer shares " "any rooms with them. The cached device lists are stale.",