This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Refactor oEmbed previews #10814

Merged: 21 commits, Sep 21, 2021
Changes from 3 commits
170 changes: 109 additions & 61 deletions synapse/rest/media/v1/preview_url_resource.py
@@ -258,6 +258,9 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:

logger.debug("got media_info of '%s'", media_info)

# The timestamp of when this media expires.
expiration_ts_ms = media_info.expires + media_info.created_ts_ms

if _is_media(media_info.media_type):
file_id = media_info.filesystem_id
dims = await self.media_repo._generate_thumbnails(
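
As a worked example of the expiry computation introduced above: the comment implies media_info.expires is a lifetime in milliseconds relative to media_info.created_ts_ms, so a one-hour cache entry expires one hour after it was fetched (illustrative numbers):

    created_ts_ms = 1_600_000_000_000           # when the media was fetched
    expires = 60 * 60 * 1000                    # a one-hour lifetime, in ms
    expiration_ts_ms = expires + created_ts_ms  # 1_600_003_600_000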
@@ -287,34 +290,8 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
encoding = get_html_media_encoding(body, media_info.media_type)
og = decode_and_calc_og(body, media_info.uri, encoding)

# pre-cache the image for posterity
# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
if "og:image" in og and og["og:image"]:
image_info = await self._download_url(
_rebase_url(og["og:image"], media_info.uri), user
)
await self._precache_image_url(user, media_info, og)

if _is_media(image_info.media_type):
# TODO: make sure we don't choke on white-on-transparent images
file_id = image_info.filesystem_id
dims = await self.media_repo._generate_thumbnails(
None, file_id, file_id, image_info.media_type, url_cache=True
)
if dims:
og["og:image:width"] = dims["width"]
og["og:image:height"] = dims["height"]
else:
logger.warning("Couldn't get dims for %s", og["og:image"])

og[
"og:image"
] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
og["og:image:type"] = image_info.media_type
og["matrix:image:size"] = image_info.media_length
else:
del og["og:image"]
else:
logger.warning("Failed to find any OG data in %s", url)
og = {}
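
The og:image URL found in a page is often relative, which is why it is passed through _rebase_url against media_info.uri before being downloaded. A minimal sketch of the idea, assuming behaviour broadly similar to urllib.parse.urljoin (the real helper is defined elsewhere in this file):

    from urllib.parse import urljoin

    page_uri = "https://example.com/articles/42"
    og_image = "/static/cover.png"        # as it appears in the page's OG tags
    print(urljoin(page_uri, og_image))    # https://example.com/static/cover.png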
@@ -340,7 +317,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
url,
media_info.response_code,
media_info.etag,
media_info.expires + media_info.created_ts_ms,
expiration_ts_ms,
jsonog,
media_info.filesystem_id,
media_info.created_ts_ms,
@@ -473,6 +450,44 @@ async def _download_url(self, url: str, user: str) -> MediaInfo:
etag=etag,
)

async def _precache_image_url(
self, user: str, media_info: MediaInfo, og: JsonDict
) -> None:
"""
Pre-cache the image (if one exists) for posterity

Args:
user: The user requesting the preview.
media_info: The media being previewed.
og: The Open Graph dictionary. This is modified with image information.
"""
#
# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
if "og:image" in og and og["og:image"]:
image_info = await self._download_url(
_rebase_url(og["og:image"], media_info.uri), user
)

if _is_media(image_info.media_type):
# TODO: make sure we don't choke on white-on-transparent images
file_id = image_info.filesystem_id
dims = await self.media_repo._generate_thumbnails(
None, file_id, file_id, image_info.media_type, url_cache=True
)
if dims:
og["og:image:width"] = dims["width"]
og["og:image:height"] = dims["height"]
else:
logger.warning("Couldn't get dims for %s", og["og:image"])

og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
og["og:image:type"] = image_info.media_type
og["matrix:image:size"] = image_info.media_length
else:
del og["og:image"]

def _start_expire_url_cache_data(self):
return run_as_background_process(
"expire_url_cache_data", self._expire_url_cache_data
@@ -668,7 +683,18 @@ def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]


def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
# suck our tree into lxml and define our OG response.
"""
Calculate metadata for an HTML document.

This uses lxml to search the HTML document for Open Graph data.

Args:
tree: The parsed HTML document.
media_uri: The URI used to download the body.

Returns:
The Open Graph response as a dictionary.
"""

# if we see any image URLs in the OG response, then spider them
# (although the client could choose to do this by asking for previews of those
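
For orientation, the mapping _calc_og builds for a typical article might resemble the following (keys commonly produced by this function; the values are made up):

    {
        "og:title": "An example article",
        "og:image": "https://example.com/cover.png",
        "og:description": "The first few paragraphs of the page, summarized...",
    }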
@@ -742,35 +768,7 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
if meta_description:
og["og:description"] = meta_description[0]
else:
# grab any text nodes which are inside the <body/> tag...
# unless they are within an HTML5 semantic markup tag...
# <header/>, <nav/>, <aside/>, <footer/>
# ...or if they are within a <script/> or <style/> tag.
# This is a very very very coarse approximation to a plain text
# render of the page.

# We don't just use XPATH here as that is slow on some machines.

from lxml import etree

TAGS_TO_REMOVE = (
"header",
"nav",
"aside",
"footer",
"script",
"noscript",
"style",
etree.Comment,
)

# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
)
og["og:description"] = summarize_paragraphs(text_nodes)
og["og:description"] = _calc_description(tree)
elif og["og:description"]:
# This must be a non-empty string at this point.
assert isinstance(og["og:description"], str)
@@ -781,6 +779,46 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
return og


def _calc_description(tree: "etree.Element") -> Optional[str]:
"""
Calculate a text description based on an HTML document.

Grabs any text nodes which are inside the <body/> tag, unless they are within
an HTML5 semantic markup tag (<header/>, <nav/>, <aside/>, <footer/>), or
if they are within a <script/> or <style/> tag.

This is a very very very coarse approximation to a plain text render of the page.

Args:
tree: The parsed HTML document.

Returns:
The plain text description, or None if one cannot be generated.
"""
# We don't just use XPATH here as that is slow on some machines.

from lxml import etree

TAGS_TO_REMOVE = (
"header",
"nav",
"aside",
"footer",
"script",
"noscript",
"style",
etree.Comment,
)

# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
)
return summarize_paragraphs(text_nodes)

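A minimal sketch of how the new helper could be exercised on its own, assuming lxml is installed and the module-level helpers (_iterate_over_text, summarize_paragraphs) are in scope:

    from lxml import etree

    html = (
        b"<html><head><script>ignored()</script></head>"
        b"<body><nav>Site menu</nav>"
        b"<p>First paragraph of real content.</p>"
        b"<p>Second paragraph.</p></body></html>"
    )
    tree = etree.fromstring(html, etree.HTMLParser())
    # Text inside <nav> and <script> is skipped; the two paragraphs are
    # normalised and summarized.
    print(_calc_description(tree))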

def _iterate_over_text(
tree, *tags_to_ignore: Iterable[Union[str, "etree.Comment"]]
) -> Generator[str, None, None]:
@@ -843,8 +881,18 @@ def _is_html(content_type: str) -> bool:
def summarize_paragraphs(
text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
) -> Optional[str]:
# Try to get a summary of between 200 and 500 words, respecting
# first paragraph and then word boundaries.
"""
Try to get a summary respecting first paragraph and then word boundaries.

Args:
text_nodes: The paragraphs to summarize.
min_size: The minimum number of words to include.
max_size: The maximum number of words to include.

Returns:
A summary of the text nodes, or None if that was not possible.
"""

# TODO: Respect sentences?

description = ""
@@ -867,7 +915,7 @@ def summarize_paragraphs(
new_desc = ""

# This splits the paragraph into words, but keeping the
# (preceeding) whitespace intact so we can easily concat
# (preceding) whitespace intact so we can easily concat
# words back together.
for match in re.finditer(r"\s*\S+", description):
word = match.group()
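
The point of keeping the preceding whitespace attached to each word is that the pieces can be concatenated back together without re-inserting spaces. For example:

    import re

    print(re.findall(r"\s*\S+", "Hello  wide   world"))
    # ['Hello', '  wide', '   world']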