From 230a2cc6d7dc701afb801429eb1db69a5410a13f Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 5 Feb 2021 12:44:44 -0500 Subject: [PATCH 1/5] Handle lxml not finding an XML tree. --- synapse/rest/media/v1/preview_url_resource.py | 5 ++++ tests/test_preview.py | 25 ++++++++++++------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index bf3be653aa6c..16d16890a487 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -725,6 +725,11 @@ def decode_and_calc_og( def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]: # Attempt to parse the body. If this fails, log and return no metadata. tree = etree.fromstring(body_attempt, parser) + + # The data was successfully parsed, but no tree was found. + if tree is None: + return {} + return _calc_og(tree, media_uri) # Attempt to parse the body. If this fails, log and return no metadata. diff --git a/tests/test_preview.py b/tests/test_preview.py index 0c6cbbd92148..10559f4e9937 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -149,7 +149,7 @@ class PreviewUrlTestCase(unittest.TestCase): skip = "url preview feature requires lxml" def test_simple(self): - html = """ + html = b""" Foo @@ -163,7 +163,7 @@ def test_simple(self): self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_comment(self): - html = """ + html = b""" Foo @@ -178,7 +178,7 @@ def test_comment(self): self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_comment2(self): - html = """ + html = b""" Foo @@ -202,7 +202,7 @@ def test_comment2(self): ) def test_script(self): - html = """ + html = b""" Foo @@ -217,7 +217,7 @@ def test_script(self): self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) def test_missing_title(self): - html = """ + html = b""" Some text. @@ -230,7 +230,7 @@ def test_missing_title(self): self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_h1_as_title(self): - html = """ + html = b""" @@ -244,7 +244,7 @@ def test_h1_as_title(self): self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) def test_missing_title_and_broken_h1(self): - html = """ + html = b"""

@@ -258,13 +258,20 @@ def test_missing_title_and_broken_h1(self): self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) def test_empty(self): - html = "" + """Test a body with no data in it.""" + html = b"" + og = decode_and_calc_og(html, "http://example.com/test.html") + self.assertEqual(og, {}) + + def test_no_tree(self): + """A valid body with no tree in it.""" + html = b"\x00" og = decode_and_calc_og(html, "http://example.com/test.html") self.assertEqual(og, {}) def test_invalid_encoding(self): """An invalid character encoding should be ignored and treated as UTF-8, if possible.""" - html = """ + html = b""" Foo From 9c9602402fe7832686f72cf1b75f0af35458045c Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 5 Feb 2021 13:17:22 -0500 Subject: [PATCH 2/5] Split out code for getting charsets and add tests. --- synapse/rest/media/v1/preview_url_resource.py | 51 ++++++++++++------- tests/test_preview.py | 39 +++++++++++++- 2 files changed, 70 insertions(+), 20 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 16d16890a487..954364ccab4d 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -300,24 +300,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes: with open(media_info["filename"], "rb") as file: body = file.read() - encoding = None - - # Let's try and figure out if it has an encoding set in a meta tag. - # Limit it to the first 1kb, since it ought to be in the meta tags - # at the top. - match = _charset_match.search(body[:1000]) - - # If we find a match, it should take precedence over the - # Content-Type header, so set it here. - if match: - encoding = match.group(1).decode("ascii") - - # If we don't find a match, we'll look at the HTTP Content-Type, and - # if that doesn't exist, we'll fall back to UTF-8. - if not encoding: - content_match = _content_type_match.match(media_info["media_type"]) - encoding = content_match.group(1) if content_match else "utf-8" - + encoding = get_html_media_encoding(body, media_info["media_type"]) og = decode_and_calc_og(body, media_info["uri"], encoding) # pre-cache the image for posterity @@ -689,6 +672,38 @@ async def _expire_url_cache_data(self) -> None: logger.debug("No media removed from url cache") +def get_html_media_encoding(body: bytes, content_type: str) -> str: + """ + Get the encoding of the body based on the (presumably) HTML body or media_type. + + If a valid character encoding cannot be found, fallback to UTF-8. + + Args: + body: The HTML document, as bytes. + content_type: The Content-Type header. + + Returns: + The character encoding of the body, as a string. + """ + # Let's try and figure out if it has an encoding set in a meta tag. + # Limit it to the first 1kb, since it ought to be in the meta tags + # at the top. + match = _charset_match.search(body[:1000]) + + # If we find a match, it should take precedence over the + # Content-Type header, so set it here. + if match: + return match.group(1).decode("ascii") + + # If we don't find a match, we'll look at the HTTP Content-Type, and + # if that doesn't exist, we'll fall back to UTF-8. + content_match = _content_type_match.match(content_type) + if content_match: + return content_match.group(1) + + return "utf-8" + + def decode_and_calc_og( body: bytes, media_uri: str, request_encoding: Optional[str] = None ) -> Dict[str, Optional[str]]: diff --git a/tests/test_preview.py b/tests/test_preview.py index 10559f4e9937..ebc6d8e6cc6b 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -15,6 +15,7 @@ from synapse.rest.media.v1.preview_url_resource import ( decode_and_calc_og, + get_html_media_encoding, summarize_paragraphs, ) @@ -26,7 +27,7 @@ lxml = None -class PreviewTestCase(unittest.TestCase): +class SummarizeTestCase(unittest.TestCase): if not lxml: skip = "url preview feature requires lxml" @@ -144,7 +145,7 @@ def test_small_then_large_summarize(self): ) -class PreviewUrlTestCase(unittest.TestCase): +class CalcOgTestCase(unittest.TestCase): if not lxml: skip = "url preview feature requires lxml" @@ -297,3 +298,37 @@ def test_invalid_encoding2(self): """ og = decode_and_calc_og(html, "http://example.com/test.html") self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) + + +class MediaEncodingTestCase(unittest.TestCase): + def test_meta_charset(self): + """A character encoding is found via the meta tag.""" + encoding = get_html_media_encoding( + b""" + + < meta charset = ascii + + """, + "text/html", + ) + self.assertEqual(encoding, "ascii") + + def test_content_type(self): + """A character encoding is found via the Content-Type header.""" + # Test a few variations of the header. + headers = ( + 'text/html; charset="ascii";', + "text/html;charset=ascii;", + 'text/html; charset="ascii"', + "text/html; charset=ascii", + 'text/html; charset="ascii;', + 'text/html; charset=ascii";', + ) + for header in headers: + encoding = get_html_media_encoding(b"", header) + self.assertEqual(encoding, "ascii") + + def test_fallback(self): + """A character encoding cannot be found in the body or header.""" + encoding = get_html_media_encoding(b"", "text/html") + self.assertEqual(encoding, "utf-8") From 5c453fe6f531eb28084033ef41cf00da7a3163ac Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 5 Feb 2021 13:32:44 -0500 Subject: [PATCH 3/5] Support XML encoding attribute and fix minor bugs. --- synapse/rest/media/v1/preview_url_resource.py | 25 +++++++---- tests/test_preview.py | 41 ++++++++++++++++++- 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 954364ccab4d..7a9ae8d18056 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -58,7 +58,8 @@ logger = logging.getLogger(__name__) -_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I) +_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I) +_xml_encoding_match = re.compile(br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I) _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) OG_TAG_NAME_MAXLEN = 50 @@ -676,7 +677,12 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str: """ Get the encoding of the body based on the (presumably) HTML body or media_type. - If a valid character encoding cannot be found, fallback to UTF-8. + The precedence used for finding a character encoding is: + + 1. meta tag with a charset declared. + 2. The XML document's character encoding attribute. + 3. The Content-Type header. + 4. Fallback to UTF-8. Args: body: The HTML document, as bytes. @@ -685,13 +691,18 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str: Returns: The character encoding of the body, as a string. """ + # Limit searches to the first 1kb, since it ought to be at the top. + body_start = body[:1024] + # Let's try and figure out if it has an encoding set in a meta tag. - # Limit it to the first 1kb, since it ought to be in the meta tags - # at the top. - match = _charset_match.search(body[:1000]) + match = _charset_match.search(body_start) + if match: + return match.group(1).decode("ascii") + + # TODO Support - # If we find a match, it should take precedence over the - # Content-Type header, so set it here. + # If we didn't find a match, see if it an XML document with an encoding. + match = _xml_encoding_match.match(body_start) if match: return match.group(1).decode("ascii") diff --git a/tests/test_preview.py b/tests/test_preview.py index ebc6d8e6cc6b..ea8329991816 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -306,13 +306,52 @@ def test_meta_charset(self): encoding = get_html_media_encoding( b""" - < meta charset = ascii + + """, "text/html", ) self.assertEqual(encoding, "ascii") + # A less well-formed version. + encoding = get_html_media_encoding( + b""" + + < meta charset = ascii> + + + """, + "text/html", + ) + self.assertEqual(encoding, "ascii") + + def test_xml_encoding(self): + """A character encoding is found via the meta tag.""" + encoding = get_html_media_encoding( + b""" + + + + """, + "text/html", + ) + self.assertEqual(encoding, "ascii") + + def test_meta_xml_encoding(self): + """Meta tags take precedence over XML encoding.""" + encoding = get_html_media_encoding( + b""" + + + + + + """, + "text/html", + ) + self.assertEqual(encoding, "UTF-16") + def test_content_type(self): """A character encoding is found via the Content-Type header.""" # Test a few variations of the header. From 5870c94c0e542de9057ad18c43aad7c801bd1d19 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 5 Feb 2021 14:24:21 -0500 Subject: [PATCH 4/5] Newsfragment --- changelog.d/9333.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/9333.bugfix diff --git a/changelog.d/9333.bugfix b/changelog.d/9333.bugfix new file mode 100644 index 000000000000..c34ba378c51b --- /dev/null +++ b/changelog.d/9333.bugfix @@ -0,0 +1 @@ +Fix additional errors when previewing URLs: "AttributeError 'NoneType' object has no attribute 'xpath'" and "ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.". From 4c961295808085a05283bbfab29f6ced90cded23 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Fri, 5 Feb 2021 14:35:55 -0500 Subject: [PATCH 5/5] Lint. --- synapse/rest/media/v1/preview_url_resource.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 7a9ae8d18056..ae53b1d23ff7 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -59,7 +59,9 @@ logger = logging.getLogger(__name__) _charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I) -_xml_encoding_match = re.compile(br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I) +_xml_encoding_match = re.compile( + br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I +) _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) OG_TAG_NAME_MAXLEN = 50