Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Delete messages from device_inbox table when deleting device #10969

Merged
merged 22 commits into from
Oct 27, 2021
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/10969.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a long-standing bug where messages in the `device_inbox` table for deleted devices would persist indefinitely. Contributed by @dklimpel and @JohannesKleine.
115 changes: 102 additions & 13 deletions synapse/storage/databases/main/devices.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,6 +955,11 @@ def __init__(self, database: DatabasePool, db_conn, hs):
self._remove_duplicate_outbound_pokes,
)

self.db_pool.updates.register_background_update_handler(
"remove_deleted_devices_from_device_inbox",
self._remove_deleted_devices_from_device_inbox,
)

# a pair of background updates that were added during the 1.14 release cycle,
# but replaced with 58/06dlols_unique_idx.py
self.db_pool.updates.register_noop_background_update(
Expand Down Expand Up @@ -1045,6 +1050,69 @@ def _txn(txn):

return rows

async def _remove_deleted_devices_from_device_inbox(
self, progress: JsonDict, batch_size: int
) -> int:
"""A background update that deletes all device_inboxes for deleted devices.

This should only need to be run once (when users upgrade to v1.45.0)
DMRobertson marked this conversation as resolved.
Show resolved Hide resolved

Args:
progress: JsonDict used to store progress of this background update
batch_size: the maximum number of rows to retrieve in a single select query

Returns:
The number of deleted rows
"""

last_device_id = progress.get("device_id", "")

def _remove_deleted_devices_from_device_inbox_txn(
txn: LoggingTransaction,
) -> int:

sql = """
SELECT device_id
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

:( I forgot that device IDs are not globally unique, so I think we need to ensure that these queries handle (user_id, device_Id) as the unique data.

Unfortunately I think that complicates the deletion clause quite a bit.

Copy link
Contributor Author

@dklimpel dklimpel Oct 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think about the best query and did some tests.


left join and distinct

explain SELECT DISTINCT i.device_id, i.user_id
FROM   device_inbox AS i
LEFT   JOIN devices as d
on d.device_id = i.device_id and d.user_id = i.user_id
WHERE  d.device_id IS NULL and d.user_id IS NULL;
                                               QUERY PLAN
--------------------------------------------------------------------------------------------------------
 Unique  (cost=32671.54..32671.55 rows=1 width=61)
   ->  Sort  (cost=32671.54..32671.55 rows=1 width=61)
         Sort Key: i.device_id, i.user_id
         ->  Gather  (cost=1292.03..32671.53 rows=1 width=61)
               Workers Planned: 2
               ->  Hash Anti Join  (cost=292.02..31671.43 rows=1 width=61)
                     Hash Cond: ((i.device_id = d.device_id) AND (i.user_id = d.user_id))
                     ->  Parallel Seq Scan on device_inbox i  (cost=0.00..30665.66 rows=95166 width=61)
                     ->  Hash  (cost=220.01..220.01 rows=4801 width=62)
                           ->  Seq Scan on devices d  (cost=0.00..220.01 rows=4801 width=62)

Results has 515 rows
Same cost if I use group by instead distinct.


left join

explain SELECT i.device_id, i.user_id
FROM   device_inbox AS i
LEFT   JOIN devices as d
on d.device_id = i.device_id and d.user_id = i.user_id
WHERE  d.device_id IS NULL and d.user_id IS NULL;
                                         QUERY PLAN
--------------------------------------------------------------------------------------------
 Gather  (cost=1292.03..32671.53 rows=1 width=61)
   Workers Planned: 2
   ->  Hash Anti Join  (cost=292.02..31671.43 rows=1 width=61)
         Hash Cond: ((i.device_id = d.device_id) AND (i.user_id = d.user_id))
         ->  Parallel Seq Scan on device_inbox i  (cost=0.00..30665.66 rows=95166 width=61)
         ->  Hash  (cost=220.01..220.01 rows=4801 width=62)
               ->  Seq Scan on devices d  (cost=0.00..220.01 rows=4801 width=62)

Result has 51353 rows


left join in sub query and distinct

explain with dev as (
SELECT i.device_id, i.user_id
FROM   device_inbox AS i
LEFT   JOIN devices as d
on d.device_id = i.device_id and d.user_id = i.user_id
WHERE  d.device_id IS NULL and d.user_id IS NULL
limit 100
)
select distinct * from dev;
                                                  QUERY PLAN
--------------------------------------------------------------------------------------------------------------
 Unique  (cost=32671.55..32671.56 rows=1 width=61)
   ->  Sort  (cost=32671.55..32671.56 rows=1 width=61)
         Sort Key: i.device_id, i.user_id
         ->  Limit  (cost=1292.03..32671.53 rows=1 width=61)
               ->  Gather  (cost=1292.03..32671.53 rows=1 width=61)
                     Workers Planned: 2
                     ->  Hash Anti Join  (cost=292.02..31671.43 rows=1 width=61)
                           Hash Cond: ((i.device_id = d.device_id) AND (i.user_id = d.user_id))
                           ->  Parallel Seq Scan on device_inbox i  (cost=0.00..30665.66 rows=95166 width=61)
                           ->  Hash  (cost=220.01..220.01 rows=4801 width=62)
                                 ->  Seq Scan on devices d  (cost=0.00..220.01 rows=4801 width=62)

Result has 6 rows


not in

explain SELECT i.device_id, i.user_id
FROM   device_inbox AS i
WHERE  ( i.device_id, i.user_id )
         NOT IN (SELECT device_id,
                        user_id
                 FROM   devices);
                                QUERY PLAN
--------------------------------------------------------------------------
 Seq Scan on device_inbox i  (cost=232.01..33372.00 rows=114200 width=61)
   Filter: (NOT (hashed SubPlan 1))
   SubPlan 1
     ->  Seq Scan on devices  (cost=0.00..220.01 rows=4801 width=62)

Result has 51353 rows


not in and distinct

explain SELECT Distinct i.device_id, i.user_id
FROM   device_inbox AS i
WHERE  ( i.device_id, i.user_id )
         NOT IN (SELECT device_id,
                        user_id
                 FROM   devices);
                                   QUERY PLAN
--------------------------------------------------------------------------------
 HashAggregate  (cost=33943.00..34171.18 rows=22818 width=61)
   Group Key: i.device_id, i.user_id
   ->  Seq Scan on device_inbox i  (cost=232.01..33372.00 rows=114200 width=61)
         Filter: (NOT (hashed SubPlan 1))
         SubPlan 1
           ->  Seq Scan on devices  (cost=0.00..220.01 rows=4801 width=62)

Result has 515 rows


not in in sub query with and distinct with limit

explain
with dev as (
SELECT i.device_id, i.user_id
FROM   device_inbox AS i
WHERE  ( i.device_id, i.user_id )
         NOT IN (SELECT device_id,
                        user_id
                 FROM   devices)
)
select distinct * from dev limit 100;
                                      QUERY PLAN
--------------------------------------------------------------------------------------
 Limit  (cost=33983.86..33984.86 rows=100 width=61)
   ->  HashAggregate  (cost=33983.86..34212.31 rows=22845 width=61)
         Group Key: i.device_id, i.user_id
         ->  Seq Scan on device_inbox i  (cost=232.03..33412.17 rows=114338 width=61)
               Filter: (NOT (hashed SubPlan 1))
               SubPlan 1
                 ->  Seq Scan on devices  (cost=0.00..220.02 rows=4802 width=62)

not in in sub query with limit and distinct

explain
with dev as (
SELECT i.device_id, i.user_id
FROM   device_inbox AS i
WHERE  ( i.device_id, i.user_id )
         NOT IN (SELECT device_id,
                        user_id
                 FROM   devices)
limit 100)
select distinct * from dev;
                                      QUERY PLAN
--------------------------------------------------------------------------------------
 HashAggregate  (cost=262.53..263.53 rows=100 width=61)
   Group Key: i.device_id, i.user_id
   ->  Limit  (cost=232.01..261.03 rows=100 width=61)
         ->  Seq Scan on device_inbox i  (cost=232.01..33372.00 rows=114200 width=61)
               Filter: (NOT (hashed SubPlan 1))
               SubPlan 1
                 ->  Seq Scan on devices  (cost=0.00..220.01 rows=4801 width=62)

Result has 6 rows


IMO the last one is the best solution. The disadvantage is that the cleanup of the database can take a lot of time, but with small costs.

@clokep Do you have better suggestions or an other opinion?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not exists

explain SELECT i.device_id, i.user_id
FROM   device_inbox AS i
WHERE  NOT EXISTS (
   SELECT  device_id, user_id
   FROM   devices as d
   WHERE  d.device_id=i.device_id and d.user_id=i.user_id
   );
                                         QUERY PLAN
--------------------------------------------------------------------------------------------
 Gather  (cost=1292.03..32671.53 rows=1 width=61)
   Workers Planned: 2
   ->  Hash Anti Join  (cost=292.02..31671.43 rows=1 width=61)
         Hash Cond: ((i.device_id = d.device_id) AND (i.user_id = d.user_id))
         ->  Parallel Seq Scan on device_inbox i  (cost=0.00..30665.66 rows=95166 width=61)
         ->  Hash  (cost=220.01..220.01 rows=4801 width=62)
               ->  Seq Scan on devices d  (cost=0.00..220.01 rows=4801 width=62)

Result has 51353 rows

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

explain delete from device_inbox where device_id not in (select device_id from devices) AND stream_id >= 230020 AND stream_id <230030;
                                                     QUERY PLAN                                                     
--------------------------------------------------------------------------------------------------------------------
 Delete on device_inbox  (cost=0.99..4982976.37 rows=266 width=6)
   ->  Index Scan using device_inbox_stream_id_user_id on device_inbox  (cost=0.99..4982976.37 rows=266 width=6)
         Index Cond: ((stream_id >= 230020) AND (stream_id < 230030))
         Filter: (NOT (SubPlan 1))
         SubPlan 1
           ->  Materialize  (cost=0.42..18196.19 rows=200267 width=20)
                 ->  Index Only Scan using device_uniqueness on devices  (cost=0.42..16020.85 rows=200267 width=20)
 JIT:
   Functions: 8
   Options: Inlining true, Optimization true, Expressions true, Deforming true
(10 rows)

This is the way it works for me. As you can see it uses the already existing index which speeds up the process. Otherwise it will easily run over days / weeks and blocking the database when writing the result for a long time.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The trick is, to get the highest stream_id, generate small chunks (1000 is a good value) of stream_id ranges and execute the query as background update. Hope this helps, i tested it with a bash script manually to find good values, but was unable to write it in python, so i am very happy to see someone is continuing. I dropped in my pull request the background job related things.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The device_id is not unique. Multiple users can have the same device_id. Unique is the tuple of device_id and user_id.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

explain select device_id, user_id from device_inbox where device_id not in (select device_id from devices);
                               QUERY PLAN
------------------------------------------------------------------------
 Seq Scan on device_inbox  (cost=232.03..32840.47 rows=114338 width=61)
   Filter: (NOT (hashed SubPlan 1))
   SubPlan 1
     ->  Seq Scan on devices  (cost=0.00..220.02 rows=4802 width=11)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The device_id is not unique. Multiple users can have the same device_id. Unique is the tuple of device_id and user_id.

ok, my posted delete query is wrong

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cost of the delete

explain DELETE FROM device_inbox WHERE device_id = 'dummy' AND user_id = 'dummy';
                                              QUERY PLAN
------------------------------------------------------------------------------------------------------
 Delete on device_inbox  (cost=0.42..8.44 rows=1 width=6)
   ->  Index Scan using device_inbox_user_stream_id on device_inbox  (cost=0.42..8.44 rows=1 width=6)
         Index Cond: ((user_id = 'dummy'::text) AND (device_id = 'dummy'::text))

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have no streaam_id 230000.

explain delete from device_inbox where device_id not in (select device_id from devices) AND stream_id >= 230020 AND stream_id <230030;
                                                 QUERY PLAN
-------------------------------------------------------------------------------------------------------------
 Delete on device_inbox  (cost=232.44..240.47 rows=1 width=6)
   ->  Index Scan using device_inbox_stream_id_user_id on device_inbox  (cost=232.44..240.47 rows=1 width=6)
         Index Cond: ((stream_id >= 230020) AND (stream_id < 230030))
         Filter: (NOT (hashed SubPlan 1))
         SubPlan 1
           ->  Seq Scan on devices  (cost=0.00..220.02 rows=4802 width=11)

explain delete from device_inbox where device_id not in (select device_id from devices);
                                 QUERY PLAN
-----------------------------------------------------------------------------
 Delete on device_inbox  (cost=232.03..32840.47 rows=114338 width=6)
   ->  Seq Scan on device_inbox  (cost=232.03..32840.47 rows=114338 width=6)
         Filter: (NOT (hashed SubPlan 1))
         SubPlan 1
           ->  Seq Scan on devices  (cost=0.00..220.02 rows=4802 width=11)

explain delete from device_inbox where device_id not in (select device_id from devices) AND stream_id >= 20 AND stream_id <23030;
                                        QUERY PLAN
------------------------------------------------------------------------------------------
 Delete on device_inbox  (cost=232.03..33983.86 rows=45320 width=6)
   ->  Seq Scan on device_inbox  (cost=232.03..33983.86 rows=45320 width=6)
         Filter: ((NOT (hashed SubPlan 1)) AND (stream_id >= 20) AND (stream_id < 23030))
         SubPlan 1
           ->  Seq Scan on devices  (cost=0.00..220.02 rows=4802 width=11)

FROM device_inbox
WHERE device_id
NOT IN (SELECT device_id FROM devices)
AND device_id > ?
ORDER BY device_id ASC
LIMIT ?;
"""

txn.execute(sql, (last_device_id, batch_size))
device_ids_to_delete = [row[0] for row in txn]

count_deleted_devices = self.db_pool.simple_delete_many_txn(
txn,
"device_inbox",
column="device_id",
values=device_ids_to_delete,
keyvalues={},
)

if device_ids_to_delete:
self.db_pool.updates._background_update_progress_txn(
txn,
"remove_deleted_devices_from_device_inbox",
{"device_id": device_ids_to_delete[-1]},
)

return count_deleted_devices

number_deleted = await self.db_pool.runInteraction(
"_remove_deleted_devices_from_device_inbox",
_remove_deleted_devices_from_device_inbox_txn,
)

if number_deleted < batch_size:
await self.db_pool.updates._end_background_update(
"remove_deleted_devices_from_device_inbox"
)

return number_deleted


class DeviceStore(DeviceWorkerStore, DeviceBackgroundUpdateStore):
def __init__(self, database: DatabasePool, db_conn, hs):
Expand Down Expand Up @@ -1121,18 +1189,27 @@ async def store_device(
raise StoreError(500, "Problem storing device.")

async def delete_device(self, user_id: str, device_id: str) -> None:
"""Delete a device.
"""Delete a device and its device_inbox.

Args:
user_id: The ID of the user which owns the device
device_id: The ID of the device to delete
"""
await self.db_pool.simple_delete_one(
table="devices",
keyvalues={"user_id": user_id, "device_id": device_id, "hidden": False},
desc="delete_device",
)

def _delete_device_txn(txn: LoggingTransaction) -> None:
self.db_pool.simple_delete_one_txn(
txn,
table="devices",
keyvalues={"user_id": user_id, "device_id": device_id, "hidden": False},
)

self.db_pool.simple_delete_txn(
txn,
table="device_inbox",
keyvalues={"user_id": user_id, "device_id": device_id},
)

await self.db_pool.runInteraction("delete_device", _delete_device_txn)
self.device_id_exists_cache.invalidate((user_id, device_id))
clokep marked this conversation as resolved.
Show resolved Hide resolved

async def delete_devices(self, user_id: str, device_ids: List[str]) -> None:
Expand All @@ -1142,13 +1219,25 @@ async def delete_devices(self, user_id: str, device_ids: List[str]) -> None:
user_id: The ID of the user which owns the devices
device_ids: The IDs of the devices to delete
"""
await self.db_pool.simple_delete_many(
table="devices",
column="device_id",
iterable=device_ids,
keyvalues={"user_id": user_id, "hidden": False},
desc="delete_devices",
)

def _delete_devices_txn(txn: LoggingTransaction) -> None:
self.db_pool.simple_delete_many_txn(
txn,
table="devices",
column="device_id",
values=device_ids,
keyvalues={"user_id": user_id, "hidden": False},
)

self.db_pool.simple_delete_many_txn(
txn,
table="device_inbox",
column="device_id",
values=device_ids,
keyvalues={"user_id": user_id},
)

await self.db_pool.runInteraction("delete_devices", _delete_devices_txn)
for device_id in device_ids:
self.device_id_exists_cache.invalidate((user_id, device_id))

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/* Copyright 2021 The Matrix.org Foundation C.I.C
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


--- Remove messages from the device_inbox table which where sent to an
--- allready deleted device.
--- This schould run as background task, it may take a little bit longer
--- to finish.
dklimpel marked this conversation as resolved.
Show resolved Hide resolved

INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
(6402, 'remove_deleted_devices_from_device_inbox', '{}');
31 changes: 31 additions & 0 deletions tests/handlers/test_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,37 @@ def test_delete_device(self):
# we'd like to check the access token was invalidated, but that's a
# bit of a PITA.

def test_delete_device_and_device_inbox(self):
self._record_users()

# add an device_inbox
self.get_success(
self.store.db_pool.simple_insert(
"device_inbox",
{
"user_id": user1,
"device_id": "abc",
"stream_id": 1,
"message_json": "{}",
},
)
)

# delete the device
self.get_success(self.handler.delete_device(user1, "abc"))

# check that the device_inbox was deleted
res = self.get_success(
self.store.db_pool.simple_select_one(
table="device_inbox",
keyvalues={"user_id": user1, "device_id": "abc"},
retcols=("user_id", "device_id"),
allow_none=True,
desc="get_device_id_from_device_inbox",
)
)
self.assertIsNone(res)

def test_update_device(self):
self._record_users()

Expand Down
96 changes: 96 additions & 0 deletions tests/storage/databases/main/test_devices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Copyright 2021 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from synapse.rest import admin
from synapse.rest.client import devices

from tests.unittest import HomeserverTestCase


class DevicesBackgroundUpdateStoreTestCase(HomeserverTestCase):

servlets = [
admin.register_servlets,
devices.register_servlets,
]

def prepare(self, reactor, clock, hs):
self.store = hs.get_datastore()
self.user_id = self.register_user("foo", "pass")

def test_background_remove_deleted_devices_from_device_inbox(self):
"""Test that the background task to delete old device_inboxes works properly."""

# create a valid device
self.get_success(
self.store.store_device(self.user_id, "cur_device", "display_name")
)

# Add device_inbox to devices
self.get_success(
self.store.db_pool.simple_insert(
"device_inbox",
{
"user_id": self.user_id,
"device_id": "cur_device",
"stream_id": 1,
"message_json": "{}",
},
)
)
self.get_success(
self.store.db_pool.simple_insert(
"device_inbox",
{
"user_id": self.user_id,
"device_id": "old_device",
"stream_id": 2,
"message_json": "{}",
},
)
)

# Insert and run the background update.
self.get_success(
self.store.db_pool.simple_insert(
"background_updates",
{
"update_name": "remove_deleted_devices_from_device_inbox",
"progress_json": "{}",
},
)
)

# ... and tell the DataStore that it hasn't finished all updates yet
self.store.db_pool.updates._all_done = False

# Now let's actually drive the updates to completion
while not self.get_success(
self.store.db_pool.updates.has_completed_background_updates()
):
self.get_success(
self.store.db_pool.updates.do_next_background_update(100), by=0.1
)
clokep marked this conversation as resolved.
Show resolved Hide resolved

# Make sure the background task deleted old device_inbox
res = self.get_success(
self.store.db_pool.simple_select_onecol(
table="device_inbox",
keyvalues={},
retcol="device_id",
desc="get_device_id_from_device_inbox",
)
)
self.assertEqual(1, len(res))
DMRobertson marked this conversation as resolved.
Show resolved Hide resolved
self.assertEqual(res[0], "cur_device")