From d7eb206795fbae939c313d5915e942b14da4a1cb Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Sun, 11 Aug 2019 16:24:45 +0100 Subject: [PATCH 1/5] Servlet to purge old rooms --- changelog.d/5845.feature | 1 + docs/admin_api/purge_room.md | 18 +++++ synapse/handlers/pagination.py | 17 +++++ synapse/rest/admin/__init__.py | 2 + synapse/rest/admin/purge_room_servlet.py | 58 +++++++++++++++++ synapse/storage/events.py | 83 ++++++++++++++++++++++++ 6 files changed, 179 insertions(+) create mode 100644 changelog.d/5845.feature create mode 100644 docs/admin_api/purge_room.md create mode 100644 synapse/rest/admin/purge_room_servlet.py diff --git a/changelog.d/5845.feature b/changelog.d/5845.feature new file mode 100644 index 000000000000..7b0dc9a95e7d --- /dev/null +++ b/changelog.d/5845.feature @@ -0,0 +1 @@ +Add an admin API to purge old rooms from the database. diff --git a/docs/admin_api/purge_room.md b/docs/admin_api/purge_room.md new file mode 100644 index 000000000000..64ea7b6a648e --- /dev/null +++ b/docs/admin_api/purge_room.md @@ -0,0 +1,18 @@ +Purge room API +============== + +This API will remove all trace of a room from your database. + +All local users must have left the room before it can be removed. + +The API is: + +``` +POST /_synapse/admin/v1/purge_room + +{ + "room_id": "!room:id" +} +``` + +You must authenticate using the access token of an admin user. diff --git a/synapse/handlers/pagination.py b/synapse/handlers/pagination.py index d83aab3f74b5..5744f4579d21 100644 --- a/synapse/handlers/pagination.py +++ b/synapse/handlers/pagination.py @@ -70,6 +70,7 @@ def __init__(self, hs): self.auth = hs.get_auth() self.store = hs.get_datastore() self.clock = hs.get_clock() + self._server_name = hs.hostname self.pagination_lock = ReadWriteLock() self._purges_in_progress_by_room = set() @@ -153,6 +154,22 @@ def get_purge_status(self, purge_id): """ return self._purges_by_id.get(purge_id) + async def purge_room(self, room_id): + """Purge the given room from the database""" + with (await self.pagination_lock.write(room_id)): + # check we know about the room + await self.store.get_room_version(room_id) + + # first check that we have no users in this room + joined = await defer.maybeDeferred( + self.store.is_host_joined, room_id, self._server_name + ) + + if joined: + raise SynapseError(400, "Users are still joined to this room") + + await self.store.purge_room(room_id) + @defer.inlineCallbacks def get_messages( self, diff --git a/synapse/rest/admin/__init__.py b/synapse/rest/admin/__init__.py index 0a7d9b81b27b..6628c9a56a3c 100644 --- a/synapse/rest/admin/__init__.py +++ b/synapse/rest/admin/__init__.py @@ -37,6 +37,7 @@ parse_string, ) from synapse.rest.admin._base import assert_requester_is_admin, assert_user_is_admin +from synapse.rest.admin.purge_room_servlet import PurgeRoomServlet from synapse.rest.admin.server_notice_servlet import SendServerNoticeServlet from synapse.types import UserID, create_requester from synapse.util.versionstring import get_version_string @@ -818,6 +819,7 @@ def register_servlets(hs, http_server): Register all the admin servlets. """ register_servlets_for_client_rest_resource(hs, http_server) + PurgeRoomServlet(hs).register(http_server) SendServerNoticeServlet(hs).register(http_server) VersionServlet(hs).register(http_server) diff --git a/synapse/rest/admin/purge_room_servlet.py b/synapse/rest/admin/purge_room_servlet.py new file mode 100644 index 000000000000..3b05412bcb13 --- /dev/null +++ b/synapse/rest/admin/purge_room_servlet.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- +# Copyright 2019 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re + +from synapse.http.servlet import ( + RestServlet, + assert_params_in_dict, + parse_json_object_from_request, +) +from synapse.rest.admin import assert_requester_is_admin + + +class PurgeRoomServlet(RestServlet): + """Servlet which will remove all trace of an a room from the database + + POST /_synapse/admin/v1/purge_room + { + "room_id": "!room:id" + } + + returns: + + { + } + """ + + PATTERNS = (re.compile("^/_synapse/admin/v1/purge_room$"),) + + def __init__(self, hs): + """ + Args: + hs (synapse.server.HomeServer): server + """ + self.hs = hs + self.auth = hs.get_auth() + self.pagination_handler = hs.get_pagination_handler() + + async def on_POST(self, request): + await assert_requester_is_admin(self.auth, request) + + body = parse_json_object_from_request(request) + assert_params_in_dict(body, ("room_id",)) + + await self.pagination_handler.purge_room(body["room_id"]) + + return (200, {}) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 88c01801164f..e6c8f9654056 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -2177,6 +2177,89 @@ def _find_unreferenced_groups_during_purge(self, txn, state_groups): return to_delete, to_dedelta + def purge_room(self, room_id): + """Deletes all record of a room + + Args: + room_id (str): + """ + + return self.runInteraction("purge_room", self._purge_room_txn, room_id) + + def _purge_room_txn(self, txn, room_id): + # first we have to delete the state groups states + logger.info("[purge] removing %s from state_groups_state", room_id) + + txn.execute( + "DELETE FROM state_groups_state " + "WHERE state_group IN (" + "SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) " + "WHERE events.room_id=?" + ")", + (room_id,), + ) + + # ... and the state group edges + logger.info("[purge] removing %s from state_group_edges", room_id) + + txn.execute( + "DELETE FROM state_group_edges " + "WHERE state_group IN (" + "SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) " + "WHERE events.room_id=?" + ")", + (room_id,), + ) + + # ... and the state groups + logger.info("[purge] removing %s from state_groups", room_id) + + txn.execute( + "DELETE FROM state_groups " + "WHERE id IN (" + "SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) " + "WHERE events.room_id=?" + ")", + (room_id,), + ) + + # and then tables which lack an index on room_id but have one on event_id + for table in ( + "event_auth", + "event_edges", + "event_reference_hashes", + "event_to_state_groups", + "rejections", + "state_events", + ): + logger.info("[purge] removing %s from %s", room_id, table) + + txn.execute( + "DELETE FROM %s WHERE event_id IN (SELECT event_id FROM events " + "WHERE room_id=?)" % (table,), + (room_id,), + ) + + # and finally, the tables with an index on room_id + for table in ( + "current_state_events", + "event_backward_extremities", + "event_forward_extremities", + "event_json", + "event_push_actions", + "event_search", + "events", + "receipts_graph", + "receipts_linearized", + "room_depth", + "room_memberships", + "rooms", + ): + logger.info("[purge] removing %s from %s", room_id, table) + txn.execute("DELETE FROM %s WHERE room_id=?" % (table,), (room_id,)) + + logger.info("[purge] done") + @defer.inlineCallbacks def is_event_after(self, event_id1, event_id2): """Returns True if event_id1 is after event_id2 in the stream From 01f61a77e6e875e716ab5581235bba53f98476d6 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> Date: Mon, 19 Aug 2019 11:18:50 +0100 Subject: [PATCH 2/5] Apply suggestions from code review Co-Authored-By: Andrew Morgan <1342360+anoadragon453@users.noreply.github.com> --- synapse/rest/admin/purge_room_servlet.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/synapse/rest/admin/purge_room_servlet.py b/synapse/rest/admin/purge_room_servlet.py index 3b05412bcb13..2922eb543ed4 100644 --- a/synapse/rest/admin/purge_room_servlet.py +++ b/synapse/rest/admin/purge_room_servlet.py @@ -23,7 +23,7 @@ class PurgeRoomServlet(RestServlet): - """Servlet which will remove all trace of an a room from the database + """Servlet which will remove all trace of a room from the database POST /_synapse/admin/v1/purge_room { @@ -32,8 +32,7 @@ class PurgeRoomServlet(RestServlet): returns: - { - } + {} """ PATTERNS = (re.compile("^/_synapse/admin/v1/purge_room$"),) From effbec3c283394c2d3f8b25892c1908bd9a4b622 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Tue, 20 Aug 2019 12:20:19 +0100 Subject: [PATCH 3/5] multiline strings ftw --- synapse/storage/events.py | 41 +++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index ae8e26648b0b..1930917693aa 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -2205,11 +2205,12 @@ def _purge_room_txn(self, txn, room_id): logger.info("[purge] removing %s from state_groups_state", room_id) txn.execute( - "DELETE FROM state_groups_state " - "WHERE state_group IN (" - "SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) " - "WHERE events.room_id=?" - ")", + """ + DELETE FROM state_groups_state WHERE state_group IN ( + SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) + WHERE events.room_id=? + ) + """, (room_id,), ) @@ -2217,11 +2218,12 @@ def _purge_room_txn(self, txn, room_id): logger.info("[purge] removing %s from state_group_edges", room_id) txn.execute( - "DELETE FROM state_group_edges " - "WHERE state_group IN (" - "SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) " - "WHERE events.room_id=?" - ")", + """ + DELETE FROM state_group_edges WHERE state_group IN ( + SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) + WHERE events.room_id=? + ) + """, (room_id,), ) @@ -2229,11 +2231,12 @@ def _purge_room_txn(self, txn, room_id): logger.info("[purge] removing %s from state_groups", room_id) txn.execute( - "DELETE FROM state_groups " - "WHERE id IN (" - "SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) " - "WHERE events.room_id=?" - ")", + """ + DELETE FROM state_groups WHERE id IN ( + SELECT state_group FROM events JOIN event_to_state_groups USING(event_id) + WHERE events.room_id=? + ) + """, (room_id,), ) @@ -2249,8 +2252,12 @@ def _purge_room_txn(self, txn, room_id): logger.info("[purge] removing %s from %s", room_id, table) txn.execute( - "DELETE FROM %s WHERE event_id IN (SELECT event_id FROM events " - "WHERE room_id=?)" % (table,), + """ + DELETE FROM %s WHERE event_id IN ( + SELECT event_id FROM events WHERE room_id=? + ) + """ + % (table,), (room_id,), ) From cb50616b6b028d612f9a4c6a838ecbf6aec1f597 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Tue, 20 Aug 2019 16:49:51 +0100 Subject: [PATCH 4/5] purge some more tables --- synapse/storage/events.py | 50 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 1930917693aa..66ff86a29c22 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -2244,8 +2244,12 @@ def _purge_room_txn(self, txn, room_id): for table in ( "event_auth", "event_edges", + "event_push_actions_staging", "event_reference_hashes", + "event_relations", "event_to_state_groups", + "group_rooms", + "redactions", "rejections", "state_events", ): @@ -2261,7 +2265,7 @@ def _purge_room_txn(self, txn, room_id): (room_id,), ) - # and finally, the tables with an index on room_id + # and finally, the tables with an index on room_id (or no useful index) for table in ( "current_state_events", "event_backward_extremities", @@ -2270,15 +2274,59 @@ def _purge_room_txn(self, txn, room_id): "event_push_actions", "event_search", "events", + "group_rooms", + "public_room_list_stream", "receipts_graph", "receipts_linearized", + "room_aliases", "room_depth", "room_memberships", + "room_state", + "room_stats", + "room_stats_earliest_token", "rooms", + "stream_ordering_to_exterm", + "topics", + "users_in_public_rooms", + "users_who_share_private_rooms", + # no useful index, but let's clear them anyway + "appservice_room_list", + "e2e_room_keys", + "event_push_summary", + "pusher_throttle", + "group_summary_rooms", + "local_invites", + "room_account_data", + "room_tags", ): logger.info("[purge] removing %s from %s", room_id, table) txn.execute("DELETE FROM %s WHERE room_id=?" % (table,), (room_id,)) + # Other tables we do NOT need to clear out: + # + # - blocked_rooms + # This is important, to make sure that we don't accidentally rejoin a blocked + # room after it was purged + # + # - user_directory + # This has a room_id column, but it is unused + # + + # Other tables that we might want to consider clearing out include: + # + # - event_reports + # Given that these are intended for abuse management my initial + # inclination is to leave them in place. + # + # - current_state_delta_stream + # - ex_outlier_stream + # - room_tags_revisions + # The problem with these is that they are largeish and there is no room_id + # index on them. In any case we should be clearing out 'stream' tables + # periodically anyway (#5888) + + # TODO: we could probably usefully do a bunch of cache invalidation here + logger.info("[purge] done") @defer.inlineCallbacks From ff71d38cf5b607c84cdab3d69a727bf4200d8807 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Tue, 20 Aug 2019 17:08:03 +0100 Subject: [PATCH 5/5] remove duplicate group_rooms --- synapse/storage/events.py | 1 - 1 file changed, 1 deletion(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 66ff86a29c22..c99f6c58c1ac 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -2248,7 +2248,6 @@ def _purge_room_txn(self, txn, room_id): "event_reference_hashes", "event_relations", "event_to_state_groups", - "group_rooms", "redactions", "rejections", "state_events",