From ca1df2691d6db03003b36a770905cd9ca17e2100 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 27 Aug 2021 11:29:36 -0400 Subject: [PATCH 1/8] Move the OEmbed logic to a separate object. --- synapse/rest/media/v1/oembed.py | 167 ++++++++++++++++++ synapse/rest/media/v1/preview_url_resource.py | 147 +-------------- tests/rest/media/v1/test_url_preview.py | 4 +- 3 files changed, 174 insertions(+), 144 deletions(-) create mode 100644 synapse/rest/media/v1/oembed.py diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py new file mode 100644 index 000000000000..2937107d749b --- /dev/null +++ b/synapse/rest/media/v1/oembed.py @@ -0,0 +1,167 @@ +# Copyright 2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +import re +from typing import Optional +from urllib import parse as urlparse + +import attr + +from synapse.http.client import SimpleHttpClient + +logger = logging.getLogger(__name__) + + +# A map of globs to API endpoints. +_oembed_globs = { + # Twitter. + "https://publish.twitter.com/oembed": [ + "https://twitter.com/*/status/*", + "https://*.twitter.com/*/status/*", + "https://twitter.com/*/moments/*", + "https://*.twitter.com/*/moments/*", + # Include the HTTP versions too. + "http://twitter.com/*/status/*", + "http://*.twitter.com/*/status/*", + "http://twitter.com/*/moments/*", + "http://*.twitter.com/*/moments/*", + ], +} +# Convert the globs to regular expressions. +_oembed_patterns = {} +for endpoint, globs in _oembed_globs.items(): + for glob in globs: + # Convert the glob into a sane regular expression to match against. The + # rules followed will be slightly different for the domain portion vs. + # the rest. + # + # 1. The scheme must be one of HTTP / HTTPS (and have no globs). + # 2. The domain can have globs, but we limit it to characters that can + # reasonably be a domain part. + # TODO: This does not attempt to handle Unicode domain names. + # 3. Other parts allow a glob to be any one, or more, characters. + results = urlparse.urlparse(glob) + + # Ensure the scheme does not have wildcards (and is a sane scheme). + if results.scheme not in {"http", "https"}: + raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,)) + + pattern = urlparse.urlunparse( + [ + results.scheme, + re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"), + ] + + [re.escape(part).replace("\\*", ".+") for part in results[2:]] + ) + _oembed_patterns[re.compile(pattern)] = endpoint + + +@attr.s(slots=True) +class OEmbedResult: + # Either HTML content or URL must be provided. + html = attr.ib(type=Optional[str]) + url = attr.ib(type=Optional[str]) + title = attr.ib(type=Optional[str]) + # Number of seconds to cache the content. + cache_age = attr.ib(type=int) + + +class OEmbedError(Exception): + """An error occurred processing the oEmbed object.""" + + +class OEmbedProvider: + def __init__(self, client: SimpleHttpClient): + self._client = client + + def get_oembed_url(self, url: str) -> Optional[str]: + """ + Check whether the URL should be downloaded as oEmbed content instead. + + Args: + url: The URL to check. + + Returns: + A URL to use instead or None if the original URL should be used. + """ + for url_pattern, endpoint in _oembed_patterns.items(): + if url_pattern.fullmatch(url): + return endpoint + + # No match. + return None + + async def get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult: + """ + Request content from an oEmbed endpoint. + + Args: + endpoint: The oEmbed API endpoint. + url: The URL to pass to the API. + + Returns: + An object representing the metadata returned. + + Raises: + OEmbedError if fetching or parsing of the oEmbed information fails. + """ + try: + logger.debug("Trying to get oEmbed content for url '%s'", url) + result = await self._client.get_json( + endpoint, + # TODO Specify max height / width. + # Note that only the JSON format is supported. + args={"url": url}, + ) + + # Ensure there's a version of 1.0. + if result.get("version") != "1.0": + raise OEmbedError("Invalid version: %s" % (result.get("version"),)) + + oembed_type = result.get("type") + + # Ensure the cache age is None or an int. + cache_age = result.get("cache_age") + if cache_age: + cache_age = int(cache_age) + + oembed_result = OEmbedResult(None, None, result.get("title"), cache_age) + + # HTML content. + if oembed_type == "rich": + oembed_result.html = result.get("html") + return oembed_result + + if oembed_type == "photo": + oembed_result.url = result.get("url") + return oembed_result + + # TODO Handle link and video types. + + if "thumbnail_url" in result: + oembed_result.url = result.get("thumbnail_url") + return oembed_result + + raise OEmbedError("Incompatible oEmbed information.") + + except OEmbedError as e: + # Trap OEmbedErrors first so we can directly re-raise them. + logger.warning("Error parsing oEmbed metadata from %s: %r", url, e) + raise + + except Exception as e: + # Trap any exception and let the code follow as usual. + # FIXME: pass through 404s and other error messages nicely + logger.warning("Error downloading oEmbed metadata from %s: %r", url, e) + raise OEmbedError() from e diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 0f051d4041ef..96669d92621d 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -25,8 +25,6 @@ from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, Optional, Union from urllib import parse as urlparse -import attr - from twisted.internet.error import DNSLookupError from twisted.web.server import Request @@ -43,6 +41,7 @@ from synapse.metrics.background_process_metrics import run_as_background_process from synapse.rest.media.v1._base import get_filename_from_headers from synapse.rest.media.v1.media_storage import MediaStorage +from synapse.rest.media.v1.oembed import OEmbedError, OEmbedProvider from synapse.util import json_encoder from synapse.util.async_helpers import ObservableDeferred from synapse.util.caches.expiringcache import ExpiringCache @@ -71,63 +70,6 @@ ONE_HOUR = 60 * 60 * 1000 -# A map of globs to API endpoints. -_oembed_globs = { - # Twitter. - "https://publish.twitter.com/oembed": [ - "https://twitter.com/*/status/*", - "https://*.twitter.com/*/status/*", - "https://twitter.com/*/moments/*", - "https://*.twitter.com/*/moments/*", - # Include the HTTP versions too. - "http://twitter.com/*/status/*", - "http://*.twitter.com/*/status/*", - "http://twitter.com/*/moments/*", - "http://*.twitter.com/*/moments/*", - ], -} -# Convert the globs to regular expressions. -_oembed_patterns = {} -for endpoint, globs in _oembed_globs.items(): - for glob in globs: - # Convert the glob into a sane regular expression to match against. The - # rules followed will be slightly different for the domain portion vs. - # the rest. - # - # 1. The scheme must be one of HTTP / HTTPS (and have no globs). - # 2. The domain can have globs, but we limit it to characters that can - # reasonably be a domain part. - # TODO: This does not attempt to handle Unicode domain names. - # 3. Other parts allow a glob to be any one, or more, characters. - results = urlparse.urlparse(glob) - - # Ensure the scheme does not have wildcards (and is a sane scheme). - if results.scheme not in {"http", "https"}: - raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,)) - - pattern = urlparse.urlunparse( - [ - results.scheme, - re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"), - ] - + [re.escape(part).replace("\\*", ".+") for part in results[2:]] - ) - _oembed_patterns[re.compile(pattern)] = endpoint - - -@attr.s(slots=True) -class OEmbedResult: - # Either HTML content or URL must be provided. - html = attr.ib(type=Optional[str]) - url = attr.ib(type=Optional[str]) - title = attr.ib(type=Optional[str]) - # Number of seconds to cache the content. - cache_age = attr.ib(type=int) - - -class OEmbedError(Exception): - """An error occurred processing the oEmbed object.""" - class PreviewUrlResource(DirectServeJsonResource): isLeaf = True @@ -157,6 +99,8 @@ def __init__( self.primary_base_path = media_repo.primary_base_path self.media_storage = media_storage + self._oembed = OEmbedProvider(self.client) + # We run the background jobs if we're the instance specified (or no # instance is specified, where we assume there is only one instance # serving media). @@ -367,87 +311,6 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes: return jsonog.encode("utf8") - def _get_oembed_url(self, url: str) -> Optional[str]: - """ - Check whether the URL should be downloaded as oEmbed content instead. - - Args: - url: The URL to check. - - Returns: - A URL to use instead or None if the original URL should be used. - """ - for url_pattern, endpoint in _oembed_patterns.items(): - if url_pattern.fullmatch(url): - return endpoint - - # No match. - return None - - async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult: - """ - Request content from an oEmbed endpoint. - - Args: - endpoint: The oEmbed API endpoint. - url: The URL to pass to the API. - - Returns: - An object representing the metadata returned. - - Raises: - OEmbedError if fetching or parsing of the oEmbed information fails. - """ - try: - logger.debug("Trying to get oEmbed content for url '%s'", url) - result = await self.client.get_json( - endpoint, - # TODO Specify max height / width. - # Note that only the JSON format is supported. - args={"url": url}, - ) - - # Ensure there's a version of 1.0. - if result.get("version") != "1.0": - raise OEmbedError("Invalid version: %s" % (result.get("version"),)) - - oembed_type = result.get("type") - - # Ensure the cache age is None or an int. - cache_age = result.get("cache_age") - if cache_age: - cache_age = int(cache_age) - - oembed_result = OEmbedResult(None, None, result.get("title"), cache_age) - - # HTML content. - if oembed_type == "rich": - oembed_result.html = result.get("html") - return oembed_result - - if oembed_type == "photo": - oembed_result.url = result.get("url") - return oembed_result - - # TODO Handle link and video types. - - if "thumbnail_url" in result: - oembed_result.url = result.get("thumbnail_url") - return oembed_result - - raise OEmbedError("Incompatible oEmbed information.") - - except OEmbedError as e: - # Trap OEmbedErrors first so we can directly re-raise them. - logger.warning("Error parsing oEmbed metadata from %s: %r", url, e) - raise - - except Exception as e: - # Trap any exception and let the code follow as usual. - # FIXME: pass through 404s and other error messages nicely - logger.warning("Error downloading oEmbed metadata from %s: %r", url, e) - raise OEmbedError() from e - async def _download_url(self, url: str, user: str) -> Dict[str, Any]: # TODO: we should probably honour robots.txt... except in practice # we're most likely being explicitly triggered by a human rather than a @@ -459,11 +322,11 @@ async def _download_url(self, url: str, user: str) -> Dict[str, Any]: # If this URL can be accessed via oEmbed, use that instead. url_to_download: Optional[str] = url - oembed_url = self._get_oembed_url(url) + oembed_url = self._oembed.get_oembed_url(url) if oembed_url: # The result might be a new URL to download, or it might be HTML content. try: - oembed_result = await self._get_oembed_content(oembed_url, url) + oembed_result = await self._oembed.get_oembed_content(oembed_url, url) if oembed_result.url: url_to_download = oembed_result.url elif oembed_result.html: diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index d3ef7bb4c637..d56598621717 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -546,7 +546,7 @@ def test_oembed_photo(self): """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL.""" # Route the HTTP version to an HTTP endpoint so that the tests work. with patch.dict( - "synapse.rest.media.v1.preview_url_resource._oembed_patterns", + "synapse.rest.media.v1.oembed._oembed_patterns", { re.compile( r"http://twitter\.com/.+/status/.+" @@ -619,7 +619,7 @@ def test_oembed_rich(self): """Test an oEmbed endpoint which returns HTML content via the 'rich' type.""" # Route the HTTP version to an HTTP endpoint so that the tests work. with patch.dict( - "synapse.rest.media.v1.preview_url_resource._oembed_patterns", + "synapse.rest.media.v1.oembed._oembed_patterns", { re.compile( r"http://twitter\.com/.+/status/.+" From bfcb3960f48b8275ddea3fd2a842fd78825a3112 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 27 Aug 2021 12:04:03 -0400 Subject: [PATCH 2/8] Pull oEmbed from a providers.json file and allow overriding the configuration. --- changelog.d/10714.feature | 1 + docs/sample_config.yaml | 19 ++ synapse/config/homeserver.py | 2 + synapse/config/oembed.py | 178 +++++++++++++++ synapse/res/providers.json | 17 ++ synapse/rest/media/v1/oembed.py | 42 +--- synapse/rest/media/v1/preview_url_resource.py | 2 +- tests/rest/media/v1/test_url_preview.py | 212 +++++++++--------- 8 files changed, 330 insertions(+), 143 deletions(-) create mode 100644 changelog.d/10714.feature create mode 100644 synapse/config/oembed.py create mode 100644 synapse/res/providers.json diff --git a/changelog.d/10714.feature b/changelog.d/10714.feature new file mode 100644 index 000000000000..bcb5a199f629 --- /dev/null +++ b/changelog.d/10714.feature @@ -0,0 +1 @@ +Allow configuration the oEmbed URLs used for URL previews. diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index 935841dbfa37..62a31efb9129 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -1075,6 +1075,25 @@ url_preview_accept_language: # - en +oembed: + # By default, the providers.json from https://oembed.com/ is included + # with Synapse. + # + # Uncomment the following to disable using these default oEmbed URLs. + # Defaults to 'false'. + # + #disable_default_providers: true + + # Additional files with oEmbed configuration (each should be in the + # form of providers.json). + # + # By default, this list is empty (so only the default providers.json + # is used). + # + #additional_providers: + # - oembed/my_providers.json + + ## Captcha ## # See docs/CAPTCHA_SETUP.md for full details of configuring this. diff --git a/synapse/config/homeserver.py b/synapse/config/homeserver.py index 1f42a51857c6..442f1b9ac071 100644 --- a/synapse/config/homeserver.py +++ b/synapse/config/homeserver.py @@ -30,6 +30,7 @@ from .logger import LoggingConfig from .metrics import MetricsConfig from .modules import ModulesConfig +from .oembed import OembedConfig from .oidc import OIDCConfig from .password_auth_providers import PasswordAuthProviderConfig from .push import PushConfig @@ -65,6 +66,7 @@ class HomeServerConfig(RootConfig): LoggingConfig, RatelimitConfig, ContentRepositoryConfig, + OembedConfig, CaptchaConfig, VoipConfig, RegistrationConfig, diff --git a/synapse/config/oembed.py b/synapse/config/oembed.py new file mode 100644 index 000000000000..6d5110195695 --- /dev/null +++ b/synapse/config/oembed.py @@ -0,0 +1,178 @@ +# Copyright 2021 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import re +from typing import Any, Dict, Iterable, List, Pattern +from urllib import parse as urlparse + +import attr +import pkg_resources + +from synapse.types import JsonDict + +from ._base import Config, ConfigError +from ._util import validate_config + + +@attr.s(slots=True, frozen=True, auto_attribs=True) +class OEmbedEndpointConfig: + # The API endpoint to fetch. + api_endpoint: str + # The patterns to match. + url_patterns: List[Pattern] + + +class OembedConfig(Config): + """oEmbed Configuration""" + + section = "oembed" + + def read_config(self, config, **kwargs): + oembed_config: Dict[str, Any] = config.get("oembed") or {} + + # A list of patterns which will be used. + self.oembed_patterns: List[OEmbedEndpointConfig] = list( + self._parse_and_validate_providers(oembed_config) + ) + + def _parse_and_validate_providers( + self, oembed_config: dict + ) -> Iterable[OEmbedEndpointConfig]: + """Extract and parse the oEmbed providers from the given JSON file. + + Returns a generator which yields the OidcProviderConfig objects + """ + # Whether to use the packaged providers.json file. + if not oembed_config.get("disable_default_providers") or False: + providers = json.load( + pkg_resources.resource_stream("synapse", "res/providers.json") + ) + yield from self._parse_and_validate_provider( + providers, config_path=("oembed",) + ) + + # The JSON files which includes additional provider information. + for i, file in enumerate(oembed_config.get("additional_providers") or []): + # TODO Error checking. + with open(file) as f: + providers = json.load(f) + + yield from self._parse_and_validate_provider( + providers, + config_path=( + "oembed", + "additional_providers", + f"", + ), + ) + + def _parse_and_validate_provider( + self, providers: List[JsonDict], config_path: Iterable[str] + ) -> Iterable[OEmbedEndpointConfig]: + # Ensure it is the proper form. + validate_config( + _OEMBED_PROVIDER_SCHEMA, + providers, + config_path=config_path, + ) + + # Parse it and yield each result. + for provider in providers: + # Each provider might have multiple API endpoints, each which + # might have multiple patterns to match. + for endpoint in provider["endpoints"]: + api_endpoint = endpoint["url"] + patterns = [ + self._glob_to_pattern(glob, config_path) + for glob in endpoint["schemes"] + ] + yield OEmbedEndpointConfig(api_endpoint, patterns) + + def _glob_to_pattern(self, glob: str, config_path: Iterable[str]) -> Pattern: + """ + Convert the glob into a sane regular expression to match against. The + rules followed will be slightly different for the domain portion vs. + the rest. + + 1. The scheme must be one of HTTP / HTTPS (and have no globs). + 2. The domain can have globs, but we limit it to characters that can + reasonably be a domain part. + TODO: This does not attempt to handle Unicode domain names. + TODO: The domain should not allow wildcard TLDs. + 3. Other parts allow a glob to be any one, or more, characters. + """ + results = urlparse.urlparse(glob) + + # Ensure the scheme does not have wildcards (and is a sane scheme). + if results.scheme not in {"http", "https"}: + raise ConfigError(f"Insecure oEmbed scheme: {results.scheme}", config_path) + + pattern = urlparse.urlunparse( + [ + results.scheme, + re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"), + ] + + [re.escape(part).replace("\\*", ".+") for part in results[2:]] + ) + return re.compile(pattern) + + def generate_config_section(self, **kwargs): + return """\ + oembed: + # By default, the providers.json from https://oembed.com/ is included + # with Synapse. + # + # Uncomment the following to disable using these default oEmbed URLs. + # Defaults to 'false'. + # + #disable_default_providers: true + + # Additional files with oEmbed configuration (each should be in the + # form of providers.json). + # + # By default, this list is empty (so only the default providers.json + # is used). + # + #additional_providers: + # - oembed/my_providers.json + """ + + +_OEMBED_PROVIDER_SCHEMA = { + "type": "array", + "items": { + "type": "object", + "properties": { + "provider_name": {"type": "string"}, + "provider_url": {"type": "string"}, + "endpoints": { + "type": "array", + "items": { + "type": "object", + "properties": { + "schemes": { + "type": "array", + "items": {"type": "string"}, + }, + "url": {"type": "string"}, + "formats": {"type": "array", "items": {"type": "string"}}, + "discovery": {"type": "boolean"}, + }, + "required": ["schemes", "url"], + }, + }, + }, + "required": ["provider_name", "provider_url", "endpoints"], + }, +} diff --git a/synapse/res/providers.json b/synapse/res/providers.json new file mode 100644 index 000000000000..f1838f955901 --- /dev/null +++ b/synapse/res/providers.json @@ -0,0 +1,17 @@ +[ + { + "provider_name": "Twitter", + "provider_url": "http://www.twitter.com/", + "endpoints": [ + { + "schemes": [ + "https://twitter.com/*/status/*", + "https://*.twitter.com/*/status/*", + "https://twitter.com/*/moments/*", + "https://*.twitter.com/*/moments/*" + ], + "url": "https://publish.twitter.com/oembed" + } + ] + } +] \ No newline at end of file diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 2937107d749b..a2472d030f24 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging -import re -from typing import Optional -from urllib import parse as urlparse +from typing import TYPE_CHECKING, Optional import attr from synapse.http.client import SimpleHttpClient +if TYPE_CHECKING: + from synapse.server import HomeServer + logger = logging.getLogger(__name__) @@ -38,33 +39,6 @@ "http://*.twitter.com/*/moments/*", ], } -# Convert the globs to regular expressions. -_oembed_patterns = {} -for endpoint, globs in _oembed_globs.items(): - for glob in globs: - # Convert the glob into a sane regular expression to match against. The - # rules followed will be slightly different for the domain portion vs. - # the rest. - # - # 1. The scheme must be one of HTTP / HTTPS (and have no globs). - # 2. The domain can have globs, but we limit it to characters that can - # reasonably be a domain part. - # TODO: This does not attempt to handle Unicode domain names. - # 3. Other parts allow a glob to be any one, or more, characters. - results = urlparse.urlparse(glob) - - # Ensure the scheme does not have wildcards (and is a sane scheme). - if results.scheme not in {"http", "https"}: - raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,)) - - pattern = urlparse.urlunparse( - [ - results.scheme, - re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"), - ] - + [re.escape(part).replace("\\*", ".+") for part in results[2:]] - ) - _oembed_patterns[re.compile(pattern)] = endpoint @attr.s(slots=True) @@ -82,7 +56,11 @@ class OEmbedError(Exception): class OEmbedProvider: - def __init__(self, client: SimpleHttpClient): + def __init__(self, hs: "HomeServer", client: SimpleHttpClient): + self._oembed_patterns = {} + for oembed_endpoint in hs.config.oembed.oembed_patterns: + for pattern in oembed_endpoint.url_patterns: + self._oembed_patterns[pattern] = oembed_endpoint.api_endpoint self._client = client def get_oembed_url(self, url: str) -> Optional[str]: @@ -95,7 +73,7 @@ def get_oembed_url(self, url: str) -> Optional[str]: Returns: A URL to use instead or None if the original URL should be used. """ - for url_pattern, endpoint in _oembed_patterns.items(): + for url_pattern, endpoint in self._oembed_patterns.items(): if url_pattern.fullmatch(url): return endpoint diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 96669d92621d..317d333b1238 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -99,7 +99,7 @@ def __init__( self.primary_base_path = media_repo.primary_base_path self.media_storage = media_storage - self._oembed = OEmbedProvider(self.client) + self._oembed = OEmbedProvider(hs, self.client) # We run the background jobs if we're the instance specified (or no # instance is specified, where we assume there is only one instance diff --git a/tests/rest/media/v1/test_url_preview.py b/tests/rest/media/v1/test_url_preview.py index d56598621717..7fa902722770 100644 --- a/tests/rest/media/v1/test_url_preview.py +++ b/tests/rest/media/v1/test_url_preview.py @@ -14,13 +14,14 @@ import json import os import re -from unittest.mock import patch from twisted.internet._resolver import HostResolution from twisted.internet.address import IPv4Address, IPv6Address from twisted.internet.error import DNSLookupError from twisted.test.proto_helpers import AccumulatingProtocol +from synapse.config.oembed import OEmbedEndpointConfig + from tests import unittest from tests.server import FakeTransport @@ -81,6 +82,19 @@ def make_homeserver(self, reactor, clock): hs = self.setup_test_homeserver(config=config) + # After the hs is created, modify the parsed oEmbed config (to avoid + # messing with files). + # + # Note that HTTP URLs are used to avoid having to deal with TLS in tests. + hs.config.oembed.oembed_patterns = [ + OEmbedEndpointConfig( + api_endpoint="http://publish.twitter.com/oembed", + url_patterns=[ + re.compile(r"http://twitter\.com/.+/status/.+"), + ], + ) + ] + return hs def prepare(self, reactor, clock, hs): @@ -544,123 +558,101 @@ def test_accept_language_config_option(self): def test_oembed_photo(self): """Test an oEmbed endpoint which returns a 'photo' type which redirects the preview to a new URL.""" - # Route the HTTP version to an HTTP endpoint so that the tests work. - with patch.dict( - "synapse.rest.media.v1.oembed._oembed_patterns", - { - re.compile( - r"http://twitter\.com/.+/status/.+" - ): "http://publish.twitter.com/oembed", - }, - clear=True, - ): - - self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] - self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")] - - result = { - "version": "1.0", - "type": "photo", - "url": "http://cdn.twitter.com/matrixdotorg", - } - oembed_content = json.dumps(result).encode("utf-8") - - end_content = ( - b"" - b"Some Title" - b'' - b"" - ) + self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] + self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")] - channel = self.make_request( - "GET", - "preview_url?url=http://twitter.com/matrixdotorg/status/12345", - shorthand=False, - await_result=False, - ) - self.pump() - - client = self.reactor.tcpClients[0][2].buildProtocol(None) - server = AccumulatingProtocol() - server.makeConnection(FakeTransport(client, self.reactor)) - client.makeConnection(FakeTransport(server, self.reactor)) - client.dataReceived( - ( - b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" - b'Content-Type: application/json; charset="utf8"\r\n\r\n' - ) - % (len(oembed_content),) - + oembed_content - ) + result = { + "version": "1.0", + "type": "photo", + "url": "http://cdn.twitter.com/matrixdotorg", + } + oembed_content = json.dumps(result).encode("utf-8") - self.pump() - - client = self.reactor.tcpClients[1][2].buildProtocol(None) - server = AccumulatingProtocol() - server.makeConnection(FakeTransport(client, self.reactor)) - client.makeConnection(FakeTransport(server, self.reactor)) - client.dataReceived( - ( - b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" - b'Content-Type: text/html; charset="utf8"\r\n\r\n' - ) - % (len(end_content),) - + end_content + end_content = ( + b"" + b"Some Title" + b'' + b"" + ) + + channel = self.make_request( + "GET", + "preview_url?url=http://twitter.com/matrixdotorg/status/12345", + shorthand=False, + await_result=False, + ) + self.pump() + + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: application/json; charset="utf8"\r\n\r\n' ) + % (len(oembed_content),) + + oembed_content + ) - self.pump() + self.pump() - self.assertEqual(channel.code, 200) - self.assertEqual( - channel.json_body, {"og:title": "Some Title", "og:description": "hi"} + client = self.reactor.tcpClients[1][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: text/html; charset="utf8"\r\n\r\n' ) + % (len(end_content),) + + end_content + ) + + self.pump() + + self.assertEqual(channel.code, 200) + self.assertEqual( + channel.json_body, {"og:title": "Some Title", "og:description": "hi"} + ) def test_oembed_rich(self): """Test an oEmbed endpoint which returns HTML content via the 'rich' type.""" - # Route the HTTP version to an HTTP endpoint so that the tests work. - with patch.dict( - "synapse.rest.media.v1.oembed._oembed_patterns", - { - re.compile( - r"http://twitter\.com/.+/status/.+" - ): "http://publish.twitter.com/oembed", - }, - clear=True, - ): - - self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] - - result = { - "version": "1.0", - "type": "rich", - "html": "
Content Preview
", - } - end_content = json.dumps(result).encode("utf-8") - - channel = self.make_request( - "GET", - "preview_url?url=http://twitter.com/matrixdotorg/status/12345", - shorthand=False, - await_result=False, - ) - self.pump() - - client = self.reactor.tcpClients[0][2].buildProtocol(None) - server = AccumulatingProtocol() - server.makeConnection(FakeTransport(client, self.reactor)) - client.makeConnection(FakeTransport(server, self.reactor)) - client.dataReceived( - ( - b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" - b'Content-Type: application/json; charset="utf8"\r\n\r\n' - ) - % (len(end_content),) - + end_content - ) + self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")] + + result = { + "version": "1.0", + "type": "rich", + "html": "
Content Preview
", + } + end_content = json.dumps(result).encode("utf-8") + + channel = self.make_request( + "GET", + "preview_url?url=http://twitter.com/matrixdotorg/status/12345", + shorthand=False, + await_result=False, + ) + self.pump() - self.pump() - self.assertEqual(channel.code, 200) - self.assertEqual( - channel.json_body, - {"og:title": None, "og:description": "Content Preview"}, + client = self.reactor.tcpClients[0][2].buildProtocol(None) + server = AccumulatingProtocol() + server.makeConnection(FakeTransport(client, self.reactor)) + client.makeConnection(FakeTransport(server, self.reactor)) + client.dataReceived( + ( + b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" + b'Content-Type: application/json; charset="utf8"\r\n\r\n' ) + % (len(end_content),) + + end_content + ) + + self.pump() + self.assertEqual(channel.code, 200) + self.assertEqual( + channel.json_body, + {"og:title": None, "og:description": "Content Preview"}, + ) From 2cf294e8eb88163f92c1abe56de2003e9f1497a2 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 30 Aug 2021 08:13:40 -0400 Subject: [PATCH 3/8] Remove unused hard-coded values. --- synapse/rest/media/v1/oembed.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index a2472d030f24..fef49499e9c3 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -24,23 +24,6 @@ logger = logging.getLogger(__name__) -# A map of globs to API endpoints. -_oembed_globs = { - # Twitter. - "https://publish.twitter.com/oembed": [ - "https://twitter.com/*/status/*", - "https://*.twitter.com/*/status/*", - "https://twitter.com/*/moments/*", - "https://*.twitter.com/*/moments/*", - # Include the HTTP versions too. - "http://twitter.com/*/status/*", - "http://*.twitter.com/*/status/*", - "http://twitter.com/*/moments/*", - "http://*.twitter.com/*/moments/*", - ], -} - - @attr.s(slots=True) class OEmbedResult: # Either HTML content or URL must be provided. From 86068a038a334e9f1312c9d825d0cadbc0d4c7f0 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 30 Aug 2021 08:16:02 -0400 Subject: [PATCH 4/8] Add a docstring. --- synapse/rest/media/v1/oembed.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index fef49499e9c3..5ba7694803b4 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -39,6 +39,13 @@ class OEmbedError(Exception): class OEmbedProvider: + """ + A helper for accessing oEmbed content. + + It can be used to check if a URL should be accessed via oEmbed and for + requesting/parsing oEmbed content. + """ + def __init__(self, hs: "HomeServer", client: SimpleHttpClient): self._oembed_patterns = {} for oembed_endpoint in hs.config.oembed.oembed_patterns: From 911aee285c966b0e9b978f7dd80e8d5372d366bb Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 30 Aug 2021 08:16:34 -0400 Subject: [PATCH 5/8] Use auto-attribs. --- synapse/rest/media/v1/oembed.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 5ba7694803b4..afe41823e4f5 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -24,14 +24,14 @@ logger = logging.getLogger(__name__) -@attr.s(slots=True) +@attr.s(slots=True, auto_attribs=True) class OEmbedResult: # Either HTML content or URL must be provided. - html = attr.ib(type=Optional[str]) - url = attr.ib(type=Optional[str]) - title = attr.ib(type=Optional[str]) + html: Optional[str] + url: Optional[str] + title: Optional[str] # Number of seconds to cache the content. - cache_age = attr.ib(type=int) + cache_age: int class OEmbedError(Exception): From db8506438401266dfe878c2e00ddcaa8a92cd306 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 30 Aug 2021 08:27:59 -0400 Subject: [PATCH 6/8] Fix info in sample config. --- docs/sample_config.yaml | 3 +-- synapse/config/oembed.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index 62a31efb9129..fe69f64545a1 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -1076,8 +1076,7 @@ url_preview_accept_language: oembed: - # By default, the providers.json from https://oembed.com/ is included - # with Synapse. + # A list of providers included with Synapse. # # Uncomment the following to disable using these default oEmbed URLs. # Defaults to 'false'. diff --git a/synapse/config/oembed.py b/synapse/config/oembed.py index 6d5110195695..9190c2a4bf3f 100644 --- a/synapse/config/oembed.py +++ b/synapse/config/oembed.py @@ -130,8 +130,7 @@ def _glob_to_pattern(self, glob: str, config_path: Iterable[str]) -> Pattern: def generate_config_section(self, **kwargs): return """\ oembed: - # By default, the providers.json from https://oembed.com/ is included - # with Synapse. + # A list of providers included with Synapse. # # Uncomment the following to disable using these default oEmbed URLs. # Defaults to 'false'. From 355129693ef1e02e127dbe61fb11e0de2016e6bf Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 31 Aug 2021 13:55:22 -0400 Subject: [PATCH 7/8] Fix typo. Co-authored-by: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> --- changelog.d/10714.feature | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog.d/10714.feature b/changelog.d/10714.feature index bcb5a199f629..7d18f5c1330f 100644 --- a/changelog.d/10714.feature +++ b/changelog.d/10714.feature @@ -1 +1 @@ -Allow configuration the oEmbed URLs used for URL previews. +Allow configuration of the oEmbed URLs used for URL previews. From 7fedb388a5143fd5bdd3c4f371bb155188263038 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Tue, 31 Aug 2021 13:56:04 -0400 Subject: [PATCH 8/8] Review comments. --- docs/sample_config.yaml | 31 +++++++++++++++++-------------- synapse/config/oembed.py | 33 ++++++++++++++++++--------------- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/docs/sample_config.yaml b/docs/sample_config.yaml index fe69f64545a1..e155b978d854 100644 --- a/docs/sample_config.yaml +++ b/docs/sample_config.yaml @@ -1075,22 +1075,25 @@ url_preview_accept_language: # - en +# oEmbed allows for easier embedding content from a website. It can be +# used for generating URLs previews of services which support it. +# oembed: - # A list of providers included with Synapse. - # - # Uncomment the following to disable using these default oEmbed URLs. - # Defaults to 'false'. - # - #disable_default_providers: true + # A default list of oEmbed providers is included with Synapse. + # + # Uncomment the following to disable using these default oEmbed URLs. + # Defaults to 'false'. + # + #disable_default_providers: true - # Additional files with oEmbed configuration (each should be in the - # form of providers.json). - # - # By default, this list is empty (so only the default providers.json - # is used). - # - #additional_providers: - # - oembed/my_providers.json + # Additional files with oEmbed configuration (each should be in the + # form of providers.json). + # + # By default, this list is empty (so only the default providers.json + # is used). + # + #additional_providers: + # - oembed/my_providers.json ## Captcha ## diff --git a/synapse/config/oembed.py b/synapse/config/oembed.py index 9190c2a4bf3f..09267b5eefcf 100644 --- a/synapse/config/oembed.py +++ b/synapse/config/oembed.py @@ -129,22 +129,25 @@ def _glob_to_pattern(self, glob: str, config_path: Iterable[str]) -> Pattern: def generate_config_section(self, **kwargs): return """\ + # oEmbed allows for easier embedding content from a website. It can be + # used for generating URLs previews of services which support it. + # oembed: - # A list of providers included with Synapse. - # - # Uncomment the following to disable using these default oEmbed URLs. - # Defaults to 'false'. - # - #disable_default_providers: true - - # Additional files with oEmbed configuration (each should be in the - # form of providers.json). - # - # By default, this list is empty (so only the default providers.json - # is used). - # - #additional_providers: - # - oembed/my_providers.json + # A default list of oEmbed providers is included with Synapse. + # + # Uncomment the following to disable using these default oEmbed URLs. + # Defaults to 'false'. + # + #disable_default_providers: true + + # Additional files with oEmbed configuration (each should be in the + # form of providers.json). + # + # By default, this list is empty (so only the default providers.json + # is used). + # + #additional_providers: + # - oembed/my_providers.json """