Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Improve oEmbed previews #10819

Merged
merged 6 commits into from
Sep 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/10819.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Improve oEmbed previews by processing the author name, photo, and video information.
49 changes: 45 additions & 4 deletions synapse/rest/media/v1/oembed.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.
import logging
import urllib.parse
from typing import TYPE_CHECKING, Optional
from typing import TYPE_CHECKING, List, Optional

import attr

Expand All @@ -22,6 +22,8 @@
from synapse.util import json_decoder

if TYPE_CHECKING:
from lxml import etree

from synapse.server import HomeServer

logger = logging.getLogger(__name__)
Expand All @@ -31,7 +33,7 @@
class OEmbedResult:
# The Open Graph result (converted from the oEmbed result).
open_graph_result: JsonDict
# Number of seconds to cache the content, according to the oEmbed response.
# Number of milliseconds to cache the content, according to the oEmbed response.
#
# This will be None if no cache-age is provided in the oEmbed response (or
# if the oEmbed response cannot be turned into an Open Graph response).
Expand Down Expand Up @@ -119,10 +121,22 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
# Ensure the cache age is None or an int.
cache_age = oembed.get("cache_age")
if cache_age:
cache_age = int(cache_age)
cache_age = int(cache_age) * 1000

# The results.
open_graph_response = {"og:title": oembed.get("title")}
open_graph_response = {
"og:url": url,
}
squahtx marked this conversation as resolved.
Show resolved Hide resolved

# Use either title or author's name as the title.
title = oembed.get("title") or oembed.get("author_name")
if title:
open_graph_response["og:title"] = title

# Use the provider name as the site name.
provider_name = oembed.get("provider_name")
if provider_name:
open_graph_response["og:site_name"] = provider_name

# If a thumbnail exists, use it. Note that dimensions will be calculated later.
if "thumbnail_url" in oembed:
Expand All @@ -137,6 +151,15 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
# If this is a photo, use the full image, not the thumbnail.
open_graph_response["og:image"] = oembed["url"]

elif oembed_type == "video":
open_graph_response["og:type"] = "video.other"
calc_description_and_urls(open_graph_response, oembed["html"])
open_graph_response["og:video:width"] = oembed["width"]
open_graph_response["og:video:height"] = oembed["height"]

elif oembed_type == "link":
open_graph_response["og:type"] = "website"

else:
raise RuntimeError(f"Unknown oEmbed type: {oembed_type}")

Expand All @@ -149,6 +172,14 @@ def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
return OEmbedResult(open_graph_response, cache_age)


def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
    """
    Collect the ``src`` attribute of every matching element in a parsed tree.

    Args:
        tree: The parsed tree to search; it is queried via its ``xpath`` method.
        tag_name: The element name to look for, e.g. ``img`` or ``video``.

    Returns:
        The ``src`` values, in the order the elements are returned by the
        query. Elements without a ``src`` attribute are skipped.
    """
    # "//*/<tag>" selects <tag> elements that are a child of any element.
    return [
        tag.attrib["src"]
        for tag in tree.xpath("//*/" + tag_name)
        if "src" in tag.attrib
    ]


def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
"""
Calculate description for an HTML document.
Expand Down Expand Up @@ -179,6 +210,16 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) ->
if tree is None:
return

# Attempt to find interesting URLs (images, videos, embeds).
if "og:image" not in open_graph_response:
image_urls = _fetch_urls(tree, "img")
if image_urls:
open_graph_response["og:image"] = image_urls[0]

video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
if video_urls:
open_graph_response["og:video"] = video_urls[0]

from synapse.rest.media.v1.preview_url_resource import _calc_description

description = _calc_description(tree)
Expand Down
2 changes: 1 addition & 1 deletion synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
with open(media_info.filename, "rb") as file:
body = file.read()

oembed_response = self._oembed.parse_oembed_response(media_info.uri, body)
oembed_response = self._oembed.parse_oembed_response(url, body)
og = oembed_response.open_graph_result

# Use the cache age from the oEmbed result, instead of the HTTP response.
Expand Down
30 changes: 21 additions & 9 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,11 +620,12 @@ def test_oembed_photo(self):
self.assertIn(b"/matrixdotorg", server.data)

self.assertEqual(channel.code, 200)
self.assertIsNone(channel.json_body["og:title"])
self.assertTrue(channel.json_body["og:image"].startswith("mxc://"))
self.assertEqual(channel.json_body["og:image:height"], 1)
self.assertEqual(channel.json_body["og:image:width"], 1)
self.assertEqual(channel.json_body["og:image:type"], "image/png")
body = channel.json_body
self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345")
self.assertTrue(body["og:image"].startswith("mxc://"))
self.assertEqual(body["og:image:height"], 1)
self.assertEqual(body["og:image:width"], 1)
self.assertEqual(body["og:image:type"], "image/png")

def test_oembed_rich(self):
"""Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
Expand All @@ -633,6 +634,8 @@ def test_oembed_rich(self):
result = {
"version": "1.0",
"type": "rich",
# Note that this provides the author, not the title.
"author_name": "Alice",
"html": "<div>Content Preview</div>",
}
end_content = json.dumps(result).encode("utf-8")
Expand Down Expand Up @@ -660,9 +663,14 @@ def test_oembed_rich(self):

self.pump()
self.assertEqual(channel.code, 200)
body = channel.json_body
self.assertEqual(
channel.json_body,
{"og:title": None, "og:description": "Content Preview"},
body,
{
"og:url": "http://twitter.com/matrixdotorg/status/12345",
"og:title": "Alice",
"og:description": "Content Preview",
},
)

def test_oembed_format(self):
Expand Down Expand Up @@ -705,7 +713,11 @@ def test_oembed_format(self):
self.assertIn(b"format=json", server.data)

self.assertEqual(channel.code, 200)
body = channel.json_body
self.assertEqual(
channel.json_body,
{"og:title": None, "og:description": "Content Preview"},
body,
{
"og:url": "http://www.hulu.com/watch/12345",
"og:description": "Content Preview",
},
)