From 230a2cc6d7dc701afb801429eb1db69a5410a13f Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Fri, 5 Feb 2021 12:44:44 -0500
Subject: [PATCH 1/5] Handle lxml not finding an XML tree.

---
 synapse/rest/media/v1/preview_url_resource.py |  5 ++++
 tests/test_preview.py                         | 25 ++++++++++++-------
 2 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index bf3be653aa6c..16d16890a487 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -725,6 +725,11 @@ def decode_and_calc_og(
     def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
         # Attempt to parse the body. If this fails, log and return no metadata.
         tree = etree.fromstring(body_attempt, parser)
+
+        # The data was successfully parsed, but no tree was found.
+        if tree is None:
+            return {}
+
         return _calc_og(tree, media_uri)
 
     # Attempt to parse the body. If this fails, log and return no metadata.
diff --git a/tests/test_preview.py b/tests/test_preview.py
index 0c6cbbd92148..10559f4e9937 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -149,7 +149,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         skip = "url preview feature requires lxml"
 
     def test_simple(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -163,7 +163,7 @@ def test_simple(self):
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_comment(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -178,7 +178,7 @@ def test_comment(self):
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_comment2(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -202,7 +202,7 @@ def test_comment2(self):
         )
 
     def test_script(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -217,7 +217,7 @@ def test_script(self):
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_missing_title(self):
-        html = """
+        html = b"""
         <html>
         <body>
         Some text.
@@ -230,7 +230,7 @@ def test_missing_title(self):
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
     def test_h1_as_title(self):
-        html = """
+        html = b"""
         <html>
         <meta property="og:description" content="Some text."/>
         <body>
@@ -244,7 +244,7 @@ def test_h1_as_title(self):
         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
 
     def test_missing_title_and_broken_h1(self):
-        html = """
+        html = b"""
         <html>
         <body>
         <h1><a href="foo"/></h1>
@@ -258,13 +258,20 @@ def test_missing_title_and_broken_h1(self):
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
     def test_empty(self):
-        html = ""
+        """Test a body with no data in it."""
+        html = b""
+        og = decode_and_calc_og(html, "http://example.com/test.html")
+        self.assertEqual(og, {})
+
+    def test_no_tree(self):
+        """A valid body with no tree in it."""
+        html = b"\x00"
         og = decode_and_calc_og(html, "http://example.com/test.html")
         self.assertEqual(og, {})
 
     def test_invalid_encoding(self):
         """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>

From 9c9602402fe7832686f72cf1b75f0af35458045c Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Fri, 5 Feb 2021 13:17:22 -0500
Subject: [PATCH 2/5] Split out code for getting charsets and add tests.

---
 synapse/rest/media/v1/preview_url_resource.py | 51 ++++++++++++-------
 tests/test_preview.py                         | 39 +++++++++++++-
 2 files changed, 70 insertions(+), 20 deletions(-)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 16d16890a487..954364ccab4d 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -300,24 +300,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
             with open(media_info["filename"], "rb") as file:
                 body = file.read()
 
-            encoding = None
-
-            # Let's try and figure out if it has an encoding set in a meta tag.
-            # Limit it to the first 1kb, since it ought to be in the meta tags
-            # at the top.
-            match = _charset_match.search(body[:1000])
-
-            # If we find a match, it should take precedence over the
-            # Content-Type header, so set it here.
-            if match:
-                encoding = match.group(1).decode("ascii")
-
-            # If we don't find a match, we'll look at the HTTP Content-Type, and
-            # if that doesn't exist, we'll fall back to UTF-8.
-            if not encoding:
-                content_match = _content_type_match.match(media_info["media_type"])
-                encoding = content_match.group(1) if content_match else "utf-8"
-
+            encoding = get_html_media_encoding(body, media_info["media_type"])
             og = decode_and_calc_og(body, media_info["uri"], encoding)
 
             # pre-cache the image for posterity
@@ -689,6 +672,38 @@ async def _expire_url_cache_data(self) -> None:
             logger.debug("No media removed from url cache")
 
 
+def get_html_media_encoding(body: bytes, content_type: str) -> str:
+    """
+    Get the encoding of the body based on the (presumably) HTML body or media_type.
+
+    If a valid character encoding cannot be found, fallback to UTF-8.
+
+    Args:
+        body: The HTML document, as bytes.
+        content_type: The Content-Type header.
+
+    Returns:
+        The character encoding of the body, as a string.
+    """
+    # Let's try and figure out if it has an encoding set in a meta tag.
+    # Limit it to the first 1kb, since it ought to be in the meta tags
+    # at the top.
+    match = _charset_match.search(body[:1000])
+
+    # If we find a match, it should take precedence over the
+    # Content-Type header, so set it here.
+    if match:
+        return match.group(1).decode("ascii")
+
+    # If we don't find a match, we'll look at the HTTP Content-Type, and
+    # if that doesn't exist, we'll fall back to UTF-8.
+    content_match = _content_type_match.match(content_type)
+    if content_match:
+        return content_match.group(1)
+
+    return "utf-8"
+
+
 def decode_and_calc_og(
     body: bytes, media_uri: str, request_encoding: Optional[str] = None
 ) -> Dict[str, Optional[str]]:
diff --git a/tests/test_preview.py b/tests/test_preview.py
index 10559f4e9937..ebc6d8e6cc6b 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -15,6 +15,7 @@
 
 from synapse.rest.media.v1.preview_url_resource import (
     decode_and_calc_og,
+    get_html_media_encoding,
     summarize_paragraphs,
 )
 
@@ -26,7 +27,7 @@
     lxml = None
 
 
-class PreviewTestCase(unittest.TestCase):
+class SummarizeTestCase(unittest.TestCase):
     if not lxml:
         skip = "url preview feature requires lxml"
 
@@ -144,7 +145,7 @@ def test_small_then_large_summarize(self):
         )
 
 
-class PreviewUrlTestCase(unittest.TestCase):
+class CalcOgTestCase(unittest.TestCase):
     if not lxml:
         skip = "url preview feature requires lxml"
 
@@ -297,3 +298,37 @@ def test_invalid_encoding2(self):
         """
         og = decode_and_calc_og(html, "http://example.com/test.html")
         self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
+
+
+class MediaEncodingTestCase(unittest.TestCase):
+    def test_meta_charset(self):
+        """A character encoding is found via the meta tag."""
+        encoding = get_html_media_encoding(
+            b"""
+        <html>
+        <head>< meta charset = ascii
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "ascii")
+
+    def test_content_type(self):
+        """A character encoding is found via the Content-Type header."""
+        # Test a few variations of the header.
+        headers = (
+            'text/html; charset="ascii";',
+            "text/html;charset=ascii;",
+            'text/html;  charset="ascii"',
+            "text/html; charset=ascii",
+            'text/html; charset="ascii;',
+            'text/html; charset=ascii";',
+        )
+        for header in headers:
+            encoding = get_html_media_encoding(b"", header)
+            self.assertEqual(encoding, "ascii")
+
+    def test_fallback(self):
+        """A character encoding cannot be found in the body or header."""
+        encoding = get_html_media_encoding(b"", "text/html")
+        self.assertEqual(encoding, "utf-8")

From 5c453fe6f531eb28084033ef41cf00da7a3163ac Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Fri, 5 Feb 2021 13:32:44 -0500
Subject: [PATCH 3/5] Support XML encoding attribute and fix minor bugs.

---
 synapse/rest/media/v1/preview_url_resource.py | 25 +++++++----
 tests/test_preview.py                         | 41 ++++++++++++++++++-
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 954364ccab4d..7a9ae8d18056 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -58,7 +58,8 @@
 
 logger = logging.getLogger(__name__)
 
-_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
+_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
+_xml_encoding_match = re.compile(br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I)
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 
 OG_TAG_NAME_MAXLEN = 50
@@ -676,7 +677,12 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
     """
     Get the encoding of the body based on the (presumably) HTML body or media_type.
 
-    If a valid character encoding cannot be found, fallback to UTF-8.
+    The precedence used for finding a character encoding is:
+
+    1. meta tag with a charset declared.
+    2. The XML document's character encoding attribute.
+    3. The Content-Type header.
+    4. Fallback to UTF-8.
 
     Args:
         body: The HTML document, as bytes.
@@ -685,13 +691,18 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
     Returns:
         The character encoding of the body, as a string.
     """
+    # Limit searches to the first 1kb, since it ought to be at the top.
+    body_start = body[:1024]
+
     # Let's try and figure out if it has an encoding set in a meta tag.
-    # Limit it to the first 1kb, since it ought to be in the meta tags
-    # at the top.
-    match = _charset_match.search(body[:1000])
+    match = _charset_match.search(body_start)
+    if match:
+        return match.group(1).decode("ascii")
+
+    # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
 
-    # If we find a match, it should take precedence over the
-    # Content-Type header, so set it here.
+    # If we didn't find a match, see if it an XML document with an encoding.
+    match = _xml_encoding_match.match(body_start)
     if match:
         return match.group(1).decode("ascii")
 
diff --git a/tests/test_preview.py b/tests/test_preview.py
index ebc6d8e6cc6b..ea8329991816 100644
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -306,13 +306,52 @@ def test_meta_charset(self):
         encoding = get_html_media_encoding(
             b"""
         <html>
-        <head>< meta charset = ascii
+        <head><meta charset="ascii">
+        </head>
         </html>
         """,
             "text/html",
         )
         self.assertEqual(encoding, "ascii")
 
+        # A less well-formed version.
+        encoding = get_html_media_encoding(
+            b"""
+        <html>
+        <head>< meta charset = ascii>
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "ascii")
+
+    def test_xml_encoding(self):
+        """A character encoding is found via the meta tag."""
+        encoding = get_html_media_encoding(
+            b"""
+        <?xml version="1.0" encoding="ascii"?>
+        <html>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "ascii")
+
+    def test_meta_xml_encoding(self):
+        """Meta tags take precedence over XML encoding."""
+        encoding = get_html_media_encoding(
+            b"""
+        <?xml version="1.0" encoding="ascii"?>
+        <html>
+        <head><meta charset="UTF-16">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "UTF-16")
+
     def test_content_type(self):
         """A character encoding is found via the Content-Type header."""
         # Test a few variations of the header.

From 5870c94c0e542de9057ad18c43aad7c801bd1d19 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Fri, 5 Feb 2021 14:24:21 -0500
Subject: [PATCH 4/5] Newsfragment

---
 changelog.d/9333.bugfix | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/9333.bugfix

diff --git a/changelog.d/9333.bugfix b/changelog.d/9333.bugfix
new file mode 100644
index 000000000000..c34ba378c51b
--- /dev/null
+++ b/changelog.d/9333.bugfix
@@ -0,0 +1 @@
+Fix additional errors when previewing URLs: "AttributeError 'NoneType' object has no attribute 'xpath'" and "ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.".

From 4c961295808085a05283bbfab29f6ced90cded23 Mon Sep 17 00:00:00 2001
From: Patrick Cloke <patrickc@matrix.org>
Date: Fri, 5 Feb 2021 14:35:55 -0500
Subject: [PATCH 5/5] Lint.

---
 synapse/rest/media/v1/preview_url_resource.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py
index 7a9ae8d18056..ae53b1d23ff7 100644
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -59,7 +59,9 @@
 logger = logging.getLogger(__name__)
 
 _charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
-_xml_encoding_match = re.compile(br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I)
+_xml_encoding_match = re.compile(
+    br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
+)
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 
 OG_TAG_NAME_MAXLEN = 50