From 2de725d7dc05f966f97c141fd08ae15bf276ffae Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Wed, 16 Nov 2022 17:28:44 +0000 Subject: [PATCH 1/9] Add optional support ICU for user search --- changelog.d/14464.feature | 1 + poetry.lock | 16 ++++- pyproject.toml | 7 ++ .../storage/databases/main/user_directory.py | 67 +++++++++++++++++-- 4 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 changelog.d/14464.feature diff --git a/changelog.d/14464.feature b/changelog.d/14464.feature new file mode 100644 index 000000000000..688ea32117e5 --- /dev/null +++ b/changelog.d/14464.feature @@ -0,0 +1 @@ +Improve user search for international display names. diff --git a/poetry.lock b/poetry.lock index 8d468adf1295..430357e4d614 100644 --- a/poetry.lock +++ b/poetry.lock @@ -837,6 +837,14 @@ category = "dev" optional = false python-versions = ">=3.5" +[[package]] +name = "pyicu" +version = "2.10.2" +description = "Python extension wrapping the ICU C++ API" +category = "main" +optional = true +python-versions = "*" + [[package]] name = "pyjwt" version = "2.4.0" @@ -1621,7 +1629,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] [extras] -all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler"] +all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler", "pyicu"] cache-memory = ["Pympler"] jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] @@ -1634,11 +1642,12 @@ sentry = ["sentry-sdk"] systemd = ["systemd-python"] test = ["parameterized", "idna"] url-preview = ["lxml"] +user-search = ["pyicu"] [metadata] lock-version = "1.1" python-versions = "^3.7.1" -content-hash = "27811bd21d56ceeb0f68ded5a00375efcd1a004928f0736f5b02927ce8594cb0" +content-hash = "41bbd333a8fb3f0f84f1d75f6dab90431553af7686d4bed49881cdfd4977366a" [metadata.files] attrs = [ @@ -2426,6 +2435,9 @@ pygments = [ {file = "Pygments-2.11.2-py3-none-any.whl", hash = "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65"}, {file = "Pygments-2.11.2.tar.gz", hash = "sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"}, ] +pyicu = [ + {file = "PyICU-2.10.2.tar.gz", hash = "sha256:0c3309eea7fab6857507ace62403515b60fe096cbfb4f90d14f55ff75c5441c1"}, +] pyjwt = [ {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, diff --git a/pyproject.toml b/pyproject.toml index e07a208e67b6..a3660e77bdbc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -207,6 +207,7 @@ hiredis = { version = "*", optional = true } Pympler = { version = "*", optional = true } parameterized = { version = ">=0.7.4", optional = true } idna = { version = ">=2.5", optional = true } +pyicu = { version = ">=2.10.2", optional = true } [tool.poetry.extras] # NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified @@ -229,6 +230,10 @@ redis = ["txredisapi", "hiredis"] # Required to use experimental `caches.track_memory_usage` config option. cache-memory = ["pympler"] test = ["parameterized", "idna"] +# Allows for better search for international characters in the user directory. This +# requires libicu's development headers installed on the system (e.g. libicu-dev on +# Debian-based distributions). +user-search = ["pyicu"] # The duplication here is awful. I hate hate hate hate hate it. However, for now I want # to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations: @@ -260,6 +265,8 @@ all = [ "txredisapi", "hiredis", # cache-memory "pympler", + # improved user search + "pyicu", # omitted: # - test: it's useful to have this separate from dev deps in the olddeps job # - systemd: this is a system-based requirement diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index ddb25b5cea7f..60fb133031f8 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -26,6 +26,14 @@ cast, ) +try: + # Figure out if ICU support is available for searching users. + import icu # type: ignore[import] + + USE_ICU = True +except ModuleNotFoundError: + USE_ICU = False + from typing_extensions import TypedDict from synapse.api.errors import StoreError @@ -903,7 +911,7 @@ def _parse_query_sqlite(search_term: str) -> str: """ # Pull out the individual words, discarding any non-word characters. - results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) + results = _parse_words(search_term) return " & ".join("(%s* OR %s)" % (result, result) for result in results) @@ -913,12 +921,63 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: We use this so that we can add prefix matching, which isn't something that is supported by default. """ - - # Pull out the individual words, discarding any non-word characters. - results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) + results = _parse_words(search_term) both = " & ".join("(%s:* | %s)" % (result, result) for result in results) exact = " & ".join("%s" % (result,) for result in results) prefix = " & ".join("%s:*" % (result,) for result in results) return both, exact, prefix + + +def _parse_words(search_term: str) -> List[str]: + """Split the provided search string into a list of its words. + + If support for ICU (International Components for Unicode) is available, use it. + Otherwise, fall back to using a regex to detect word boundaries. This latter + solution works well enough for most latin-based languages, but doesn't work as well + with other languages. + + Args: + search_term: The search string. + + Returns: + A list of the words in the search string. + """ + if USE_ICU: + return _parse_words_icu(search_term) + + return re.findall(r"([\w\-]+)", search_term, re.UNICODE) + + +def _parse_words_icu(search_term: str) -> List[str]: + """Break down the provided search string into its individual words using ICU + (International Components for Unicode). + + Args: + search_term: The search string. + + Returns: + A list of the words in the search string. + """ + results = [] + breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault()) + breaker.setText(search_term) + i = 0 + while True: + j = breaker.nextBoundary() + if j < 0: + break + + result = search_term[i:j] + + # libicu considers spaces and punctuation between words as words, but we don't + # want to include those in results as they would result in syntax errors in SQL + # queries (e.g. "foo bar" would result in the search query including "foo & & + # bar"). + if len(re.findall(r"([\w\-]+)", result, re.UNICODE)): + results.append(result) + + i = j + + return results From 7ea4b05f98ff4329c47414adc34993d42e28e443 Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Tue, 6 Dec 2022 11:23:53 +0000 Subject: [PATCH 2/9] Slightly improve method name --- synapse/storage/databases/main/user_directory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 1c741263fbd3..0ac73e59df09 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -942,12 +942,12 @@ def _parse_words(search_term: str) -> List[str]: A list of the words in the search string. """ if USE_ICU: - return _parse_words_icu(search_term) + return _parse_words_with_icu(search_term) return re.findall(r"([\w\-]+)", search_term, re.UNICODE) -def _parse_words_icu(search_term: str) -> List[str]: +def _parse_words_with_icu(search_term: str) -> List[str]: """Break down the provided search string into its individual words using ICU (International Components for Unicode). From 131b30fd3ddd2f427fb861a57b78b2aa7dc2cc1a Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Tue, 6 Dec 2022 15:15:51 +0000 Subject: [PATCH 3/9] Add a test --- tests/storage/test_user_directory.py | 43 ++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 5b60cf5285e4..356879ea3f1a 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re from typing import Any, Dict, Set, Tuple from unittest import mock from unittest.mock import Mock, patch @@ -30,6 +31,12 @@ from tests.test_utils.event_injection import inject_member_event from tests.unittest import HomeserverTestCase, override_config +try: + import icu +except ImportError: + icu = None # type: ignore + + ALICE = "@alice:a" BOB = "@bob:b" BOBBY = "@bobby:a" @@ -461,3 +468,39 @@ def test_search_user_dir_stop_words(self) -> None: r["results"][0], {"user_id": BELA, "display_name": "Bela", "avatar_url": None}, ) + + +class UserDirectoryICUTestCase(HomeserverTestCase): + if not icu: + skip = "Requires PyICU" + + def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: + self.store = hs.get_datastores().main + self.user_dir_helper = GetUserDirectoryTables(self.store) + + def test_icu_word_boundary(self) -> None: + """Tests that we correctly detect word boundaries when ICU (International + Components for Unicode) support is available. + """ + + display_name = "Gáo" + + # This word is not broken down correctly by Python's regular expressions, + # likely because á is actually a lowercase a followed by a U+0301 combining + # acute accent. This is specifically something that ICU support fixes. + matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE) + self.assertEqual(len(matches), 2) + + self.get_success( + self.store.update_profile_in_user_dir(ALICE, display_name, None) + ) + self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,))) + + # Check that searching for this user yields the correct result. + r = self.get_success(self.store.search_user_dir(BOB, display_name, 10)) + self.assertFalse(r["limited"]) + self.assertEqual(len(r["results"]), 1) + self.assertDictEqual( + r["results"][0], + {"user_id": ALICE, "display_name": display_name, "avatar_url": None}, + ) From b25b534a1f6a71ad88e13b73db21bb7862fd242a Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Tue, 6 Dec 2022 15:16:27 +0000 Subject: [PATCH 4/9] Add ICU support to docker image --- docker/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index 7f8756e8a4df..90c53749299f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -97,6 +97,7 @@ RUN \ zlib1g-dev \ git \ curl \ + libicu-dev \ && rm -rf /var/lib/apt/lists/* From e4579d7227187b5a98f2f687ca659a819037a9ba Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Tue, 6 Dec 2022 15:21:12 +0000 Subject: [PATCH 5/9] Add dependency to libicu-dev on Debian packages This library seems to be readily available on all supported Ubuntu and Debian distros --- debian/control | 1 + 1 file changed, 1 insertion(+) diff --git a/debian/control b/debian/control index 86f5a66d021e..1b3208a0ca77 100644 --- a/debian/control +++ b/debian/control @@ -8,6 +8,7 @@ Build-Depends: dh-virtualenv (>= 1.1), libsystemd-dev, libpq-dev, + libicu-dev, lsb-release, python3-dev, python3, From 47449826d535fade0d031cf8391535abcefefdbb Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Tue, 6 Dec 2022 15:30:44 +0000 Subject: [PATCH 6/9] Debian changelog --- debian/changelog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/debian/changelog b/debian/changelog index 6868660a7df3..66ca30b1bd8e 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,10 @@ +matrix-synapse-py3 (1.74.0~rc1) UNRELEASED; urgency=medium + + * New dependency on libicu-dev to provide improved results for user + search. + + -- Synapse Packaging team Tue, 06 Dec 2022 15:28:10 +0000 + matrix-synapse-py3 (1.73.0~rc2) stable; urgency=medium * New Synapse release 1.73.0rc2. From 7806e21807b94eb77b1b4b965dec66669ca420d1 Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Tue, 6 Dec 2022 15:50:25 +0000 Subject: [PATCH 7/9] Fix debian package build --- docker/Dockerfile-dhvirtualenv | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/Dockerfile-dhvirtualenv b/docker/Dockerfile-dhvirtualenv index 73165f6f85ed..4dc027447d2d 100644 --- a/docker/Dockerfile-dhvirtualenv +++ b/docker/Dockerfile-dhvirtualenv @@ -84,6 +84,7 @@ RUN apt-get update -qq -o Acquire::Languages=none \ python3-venv \ sqlite3 \ libpq-dev \ + libicu-dev \ xmlsec1 # Install rust and ensure it's in the PATH From d632735c14fe16362c6a81a908a0b3c059c9e107 Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Tue, 6 Dec 2022 16:31:06 +0000 Subject: [PATCH 8/9] Add stub for pyicu --- stubs/icu.pyi | 25 +++++++++++++++++++ .../storage/databases/main/user_directory.py | 2 +- 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 stubs/icu.pyi diff --git a/stubs/icu.pyi b/stubs/icu.pyi new file mode 100644 index 000000000000..efeda7938a73 --- /dev/null +++ b/stubs/icu.pyi @@ -0,0 +1,25 @@ +# Copyright 2022 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Stub for PyICU. + +class Locale: + @staticmethod + def getDefault() -> Locale: ... + +class BreakIterator: + @staticmethod + def createWordInstance(locale: Locale) -> BreakIterator: ... + def setText(self, text: str) -> None: ... + def nextBoundary(self) -> int: ... diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 0ac73e59df09..c0ec2dd38ea8 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -28,7 +28,7 @@ try: # Figure out if ICU support is available for searching users. - import icu # type: ignore[import] + import icu USE_ICU = True except ModuleNotFoundError: From a6770a39d07b9c7965b0b46581746a56d0cdd879 Mon Sep 17 00:00:00 2001 From: Brendan Abolivier Date: Tue, 6 Dec 2022 17:37:21 +0000 Subject: [PATCH 9/9] Also add a dependency to pkg-config because pyicu requires it --- debian/control | 1 + docker/Dockerfile | 1 + docker/Dockerfile-dhvirtualenv | 1 + 3 files changed, 3 insertions(+) diff --git a/debian/control b/debian/control index 1b3208a0ca77..bc628cec085e 100644 --- a/debian/control +++ b/debian/control @@ -9,6 +9,7 @@ Build-Depends: libsystemd-dev, libpq-dev, libicu-dev, + pkg-config, lsb-release, python3-dev, python3, diff --git a/docker/Dockerfile b/docker/Dockerfile index 90c53749299f..92277ce6cb26 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -98,6 +98,7 @@ RUN \ git \ curl \ libicu-dev \ + pkg-config \ && rm -rf /var/lib/apt/lists/* diff --git a/docker/Dockerfile-dhvirtualenv b/docker/Dockerfile-dhvirtualenv index 4dc027447d2d..f3b5b00ce61a 100644 --- a/docker/Dockerfile-dhvirtualenv +++ b/docker/Dockerfile-dhvirtualenv @@ -85,6 +85,7 @@ RUN apt-get update -qq -o Acquire::Languages=none \ sqlite3 \ libpq-dev \ libicu-dev \ + pkg-config \ xmlsec1 # Install rust and ensure it's in the PATH