This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Refactor oEmbed previews #10814

Merged: 21 commits, Sep 21, 2021
Changes from 3 commits
170 changes: 109 additions & 61 deletions synapse/rest/media/v1/preview_url_resource.py
@@ -258,6 +258,9 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:

logger.debug("got media_info of '%s'", media_info)

# The timestamp of when this media expires.
expiration_ts_ms = media_info.expires + media_info.created_ts_ms

if _is_media(media_info.media_type):
file_id = media_info.filesystem_id
dims = await self.media_repo._generate_thumbnails(
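
As a worked example of the expiry computation introduced above: the comment implies media_info.expires is a lifetime in milliseconds relative to media_info.created_ts_ms, so a one-hour cache entry expires one hour after it was fetched (illustrative numbers):

    created_ts_ms = 1_600_000_000_000           # when the media was fetched
    expires = 60 * 60 * 1000                    # a one-hour lifetime, in ms
    expiration_ts_ms = expires + created_ts_ms  # 1_600_003_600_000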
@@ -287,34 +290,8 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
encoding = get_html_media_encoding(body, media_info.media_type)
og = decode_and_calc_og(body, media_info.uri, encoding)

# pre-cache the image for posterity
# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
if "og:image" in og and og["og:image"]:
image_info = await self._download_url(
_rebase_url(og["og:image"], media_info.uri), user
)
await self._precache_image_url(user, media_info, og)

if _is_media(image_info.media_type):
# TODO: make sure we don't choke on white-on-transparent images
file_id = image_info.filesystem_id
dims = await self.media_repo._generate_thumbnails(
None, file_id, file_id, image_info.media_type, url_cache=True
)
if dims:
og["og:image:width"] = dims["width"]
og["og:image:height"] = dims["height"]
else:
logger.warning("Couldn't get dims for %s", og["og:image"])

og[
"og:image"
] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
og["og:image:type"] = image_info.media_type
og["matrix:image:size"] = image_info.media_length
else:
del og["og:image"]
else:
logger.warning("Failed to find any OG data in %s", url)
og = {}
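
The og:image URL found in a page is often relative, which is why it is passed through _rebase_url against media_info.uri before being downloaded. A minimal sketch of the idea, assuming behaviour broadly similar to urllib.parse.urljoin (the real helper is defined elsewhere in this file):

    from urllib.parse import urljoin

    page_uri = "https://example.com/articles/42"
    og_image = "/static/cover.png"        # as it appears in the page's OG tags
    print(urljoin(page_uri, og_image))    # https://example.com/static/cover.png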
@@ -340,7 +317,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
url,
media_info.response_code,
media_info.etag,
media_info.expires + media_info.created_ts_ms,
expiration_ts_ms,
jsonog,
media_info.filesystem_id,
media_info.created_ts_ms,
@@ -473,6 +450,44 @@ async def _download_url(self, url: str, user: str) -> MediaInfo:
etag=etag,
)

async def _precache_image_url(
self, user: str, media_info: MediaInfo, og: JsonDict
) -> None:
"""
Pre-cache the image (if one exists) for posterity

Args:
user: The user requesting the preview.
media_info: The media being previewed.
og: The Open Graph dictionary. This is modified with image information.
"""
#
# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
if "og:image" in og and og["og:image"]:
image_info = await self._download_url(
_rebase_url(og["og:image"], media_info.uri), user
)

if _is_media(image_info.media_type):
# TODO: make sure we don't choke on white-on-transparent images
file_id = image_info.filesystem_id
dims = await self.media_repo._generate_thumbnails(
None, file_id, file_id, image_info.media_type, url_cache=True
)
if dims:
og["og:image:width"] = dims["width"]
og["og:image:height"] = dims["height"]
else:
logger.warning("Couldn't get dims for %s", og["og:image"])

og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
og["og:image:type"] = image_info.media_type
og["matrix:image:size"] = image_info.media_length
else:
del og["og:image"]

def _start_expire_url_cache_data(self):
return run_as_background_process(
"expire_url_cache_data", self._expire_url_cache_data
@@ -668,7 +683,18 @@ def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]


def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
# suck our tree into lxml and define our OG response.
"""
Calculate metadata for an HTML document.

This uses lxml to search the HTML document for Open Graph data.

Args:
tree: The parsed HTML document.
media_uri: The URI used to download the body.

Returns:
The Open Graph response as a dictionary.
"""

# if we see any image URLs in the OG response, then spider them
# (although the client could choose to do this by asking for previews of those
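
For orientation, the mapping _calc_og builds for a typical article might resemble the following (keys commonly produced by this function; the values are made up):

    {
        "og:title": "An example article",
        "og:image": "https://example.com/cover.png",
        "og:description": "The first few paragraphs of the page, summarized...",
    }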
@@ -742,35 +768,7 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
if meta_description:
og["og:description"] = meta_description[0]
else:
# grab any text nodes which are inside the <body/> tag...
# unless they are within an HTML5 semantic markup tag...
# <header/>, <nav/>, <aside/>, <footer/>
# ...or if they are within a <script/> or <style/> tag.
# This is a very very very coarse approximation to a plain text
# render of the page.

# We don't just use XPATH here as that is slow on some machines.

from lxml import etree

TAGS_TO_REMOVE = (
"header",
"nav",
"aside",
"footer",
"script",
"noscript",
"style",
etree.Comment,
)

# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
)
og["og:description"] = summarize_paragraphs(text_nodes)
og["og:description"] = _calc_description(tree)
elif og["og:description"]:
# This must be a non-empty string at this point.
assert isinstance(og["og:description"], str)
@@ -781,6 +779,46 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
return og


def _calc_description(tree: "etree.Element") -> Optional[str]:
"""
Calculate a text description based on an HTML document.

Grabs any text nodes which are inside the <body/> tag, unless they are within
an HTML5 semantic markup tag (<header/>, <nav/>, <aside/>, <footer/>), or
if they are within a <script/> or <style/> tag.

This is a very very very coarse approximation to a plain text render of the page.

Args:
tree: The parsed HTML document.

Returns:
The plain text description, or None if one cannot be generated.
"""
# We don't just use XPATH here as that is slow on some machines.

from lxml import etree

TAGS_TO_REMOVE = (
"header",
"nav",
"aside",
"footer",
"script",
"noscript",
"style",
etree.Comment,
)

# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
)
return summarize_paragraphs(text_nodes)

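A minimal sketch of how the new helper could be exercised on its own, assuming lxml is installed and the module-level helpers (_iterate_over_text, summarize_paragraphs) are in scope:

    from lxml import etree

    html = (
        b"<html><head><script>ignored()</script></head>"
        b"<body><nav>Site menu</nav>"
        b"<p>First paragraph of real content.</p>"
        b"<p>Second paragraph.</p></body></html>"
    )
    tree = etree.fromstring(html, etree.HTMLParser())
    # Text inside <nav> and <script> is skipped; the two paragraphs are
    # normalised and summarized.
    print(_calc_description(tree))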

def _iterate_over_text(
tree, *tags_to_ignore: Iterable[Union[str, "etree.Comment"]]
) -> Generator[str, None, None]:
@@ -843,8 +881,18 @@ def _is_html(content_type: str) -> bool:
def summarize_paragraphs(
text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
) -> Optional[str]:
# Try to get a summary of between 200 and 500 words, respecting
# first paragraph and then word boundaries.
"""
Try to get a summary respecting first paragraph and then word boundaries.

Args:
text_nodes: The paragraphs to summarize.
min_size: The minimum number of words to include.
max_size: The maximum number of words to include.

Returns:
A summary of the text nodes, or None if that was not possible.
"""

# TODO: Respect sentences?

description = ""
@@ -867,7 +915,7 @@ def summarize_paragraphs(
new_desc = ""

# This splits the paragraph into words, but keeping the
# (preceeding) whitespace intact so we can easily concat
# (preceding) whitespace intact so we can easily concat
# words back together.
for match in re.finditer(r"\s*\S+", description):
word = match.group()
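
The point of keeping the preceding whitespace attached to each word is that the pieces can be concatenated back together without re-inserting spaces. For example:

    import re

    print(re.findall(r"\s*\S+", "Hello  wide   world"))
    # ['Hello', '  wide', '   world']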