Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Handle non-strings in the event_search table in synapse_port_db #12037

Merged
merged 13 commits into from
Feb 24, 2022
1 change: 1 addition & 0 deletions changelog.d/12037.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Clean up after a long-standing bug that was fixed by accident in Synapse 1.44.0 where integers could be inserted into the `event_search` table when using sqlite.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The wording's a little clumsy here. Suggestions are welcome!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this is a little cleaner? Just a thought!

Suggested change
Clean up after a long-standing bug that was fixed by accident in Synapse 1.44.0 where integers could be inserted into the `event_search` table when using sqlite.
Clean up after a long-standing bug where integers could be inserted into the `event_search` table when using sqlite. This bug was fixed by accident in Synapse 1.44.0

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it help to mention symptoms?

Fixed a long-standing bug where wrong data could be inserted in the event_search table when using sqlite. This could block running synapse_port_db with an "argument of type 'int' is not iterable" error.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you both for the suggestions!

6 changes: 3 additions & 3 deletions synapse/storage/databases/main/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -1954,19 +1954,19 @@ def _handle_redaction(self, txn, redacted_event_id):
)

def _store_room_topic_txn(self, txn, event):
clokep marked this conversation as resolved.
Show resolved Hide resolved
if hasattr(event, "content") and "topic" in event.content:
if hasattr(event, "content") and isinstance(event.content.get("topic"), str):
clokep marked this conversation as resolved.
Show resolved Hide resolved
self.store_event_search_txn(
txn, event, "content.topic", event.content["topic"]
)

def _store_room_name_txn(self, txn, event):
if hasattr(event, "content") and "name" in event.content:
if hasattr(event, "content") and isinstance(event.content.get("name"), str):
self.store_event_search_txn(
txn, event, "content.name", event.content["name"]
)

def _store_room_message_txn(self, txn, event):
if hasattr(event, "content") and "body" in event.content:
if hasattr(event, "content") and isinstance(event.content.get("body"), str):
self.store_event_search_txn(
txn, event, "content.body", event.content["body"]
)
Expand Down
27 changes: 27 additions & 0 deletions synapse/storage/databases/main/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ class SearchBackgroundUpdateStore(SearchWorkerStore):
EVENT_SEARCH_ORDER_UPDATE_NAME = "event_search_order"
EVENT_SEARCH_USE_GIST_POSTGRES_NAME = "event_search_postgres_gist"
EVENT_SEARCH_USE_GIN_POSTGRES_NAME = "event_search_postgres_gin"
EVENT_SEARCH_DELETE_NON_STRINGS = "event_search_sqlite_delete_non_strings"

def __init__(
self,
Expand Down Expand Up @@ -147,6 +148,10 @@ def __init__(
self.EVENT_SEARCH_USE_GIN_POSTGRES_NAME, self._background_reindex_gin_search
)

self.db_pool.updates.register_background_update_handler(
self.EVENT_SEARCH_DELETE_NON_STRINGS, self._background_delete_non_strings
)

async def _background_reindex_search(self, progress, batch_size):
# we work through the events table from highest stream id to lowest
target_min_stream_id = progress["target_min_stream_id_inclusive"]
Expand Down Expand Up @@ -372,6 +377,28 @@ def reindex_search_txn(txn):

return num_rows

async def _background_delete_non_strings(
self, progress: JsonDict, batch_size: int
) -> int:
"""Deletes rows with non-string `value`s from `event_search` if using sqlite.

Prior to Synapse 1.44.0, malformed events received over federation could cause integers
to be inserted into the `event_search` table when using sqlite.
"""

def delete_non_strings_txn(txn: LoggingTransaction) -> None:
txn.execute("DELETE FROM event_search WHERE typeof(value) != 'text'")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to limit this in any way? (Will this cause any locks on the table?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question! It crossed my mind at some point and I forgot to make a note of it on the PR.

event_search is defined as a full text search virtual table with no explicit indexes:
CREATE VIRTUAL TABLE event_search USING fts4 (event_id, room_id, sender, key, value);
so it looked like doing the processing in batches would be slower, since DELETEing by specific event_ids would have to scan the whole table anyway.

I think this will lock the table for the duration of the DELETE. The sqlite docs suggest that locks are done on the file level, so it's actually the entire database that gets locked, which is unfortunate.

We could add an index first, then delete in batches, but if we're going to ask sqlite to scan the whole table to build an index, we might as well just do the delete.


if isinstance(self.database_engine, Sqlite3Engine):
squahtx marked this conversation as resolved.
Show resolved Hide resolved
await self.db_pool.runInteraction(
self.EVENT_SEARCH_DELETE_NON_STRINGS, delete_non_strings_txn
)

await self.db_pool.updates._end_background_update(
self.EVENT_SEARCH_DELETE_NON_STRINGS
)
return 1


class SearchStore(SearchBackgroundUpdateStore):
def __init__(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/* Copyright 2022 The Matrix.org Foundation C.I.C
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


-- Delete rows with non-string `value`s from `event_search` if using sqlite.
--
-- Prior to Synapse 1.44.0, malformed events received over federation could
-- cause integers to be inserted into the `event_search` table.
INSERT INTO background_updates (ordering, update_name, progress_json) VALUES
(6805, 'event_search_sqlite_delete_non_strings', '{}');
clokep marked this conversation as resolved.
Show resolved Hide resolved
114 changes: 112 additions & 2 deletions tests/storage/test_room_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@
# limitations under the License.

import synapse.rest.admin
from synapse.api.constants import EventTypes
from synapse.api.errors import StoreError
from synapse.rest.client import login, room
from synapse.storage.engines import PostgresEngine

from tests.unittest import HomeserverTestCase
from tests.unittest import HomeserverTestCase, skip_unless
from tests.utils import USE_POSTGRES_FOR_TESTS


class NullByteInsertionTest(HomeserverTestCase):
class EventSearchInsertionTest(HomeserverTestCase):
servlets = [
synapse.rest.admin.register_servlets_for_client_rest_resource,
login.register_servlets,
Expand Down Expand Up @@ -72,3 +75,110 @@ def test_null_byte(self):
)
if isinstance(store.database_engine, PostgresEngine):
self.assertIn("alice", result.get("highlights"))

@skip_unless(not USE_POSTGRES_FOR_TESTS, "requires sqlite")
clokep marked this conversation as resolved.
Show resolved Hide resolved
def test_sqlite_non_string(self):
"""Test that non-string `value`s are not inserted into `event_search` on sqlite.

Regression test for #11918.
"""
store = self.hs.get_datastore()

# Register a user and create a room
user_id = self.register_user("alice", "password")
access_token = self.login("alice", "password")
room_id = self.helper.create_room_as("alice", tok=access_token)
room_version = self.get_success(store.get_room_version(room_id))

# Construct a message with a numeric body to be received over federation
# The message can't be sent using the client API, since Synapse's event
# validation will reject it.
prev_event_ids = self.get_success(store.get_prev_events_for_room(room_id))
prev_event = self.get_success(store.get_event(prev_event_ids[0]))
prev_state_map = self.get_success(
self.hs.get_storage().state.get_state_ids_for_event(prev_event_ids[0])
)

event_dict = {
"type": EventTypes.Message,
"content": {"msgtype": "m.text", "body": 2},
"room_id": room_id,
"sender": user_id,
"depth": prev_event.depth + 1,
"prev_events": prev_event_ids,
"origin_server_ts": self.clock.time_msec(),
}
builder = self.hs.get_event_builder_factory().for_room_version(
room_version, event_dict
)
event = self.get_success(
builder.build(
prev_event_ids=prev_event_ids,
auth_event_ids=self.hs.get_event_auth_handler().compute_auth_events(
builder,
prev_state_map,
for_verification=False,
),
depth=event_dict["depth"],
)
)

# Receive the event
self.get_success(
self.hs.get_federation_event_handler().on_receive_pdu(
self.hs.hostname, event
)
)

# The event should not have an entry in the `event_search` table
f = self.get_failure(
store.db_pool.simple_select_one_onecol(
"event_search",
{"room_id": room_id, "event_id": event.event_id},
"value",
),
StoreError,
)
self.assertEqual(f.value.code, 404)

@skip_unless(not USE_POSTGRES_FOR_TESTS, "requires sqlite")
def test_sqlite_non_string_deletion_background_update(self):
"""Test the background update to delete bad rows from `event_search`."""
store = self.hs.get_datastore()

# Populate `event_search` with dummy data
self.get_success(
store.db_pool.simple_insert_many(
"event_search",
keys=["event_id", "room_id", "key", "value"],
values=[
("event1", "room_id", "content.body", "hi"),
("event2", "room_id", "content.body", "2"),
("event3", "room_id", "content.body", 3),
],
desc="populate_event_search",
)
)

# Run the background update
store.db_pool.updates._all_done = False
self.get_success(
store.db_pool.simple_insert(
"background_updates",
{
"update_name": "event_search_sqlite_delete_non_strings",
"progress_json": "{}",
},
)
)
self.wait_for_background_updates()

# The non-string `value`s ought to be gone now.
values = self.get_success(
store.db_pool.simple_select_onecol(
"event_search",
{"room_id": "room_id"},
"value",
),
)
self.assertCountEqual(values, ["hi", "2"])