diff --git a/changelog.d/9333.bugfix b/changelog.d/9333.bugfix new file mode 100644 index 000000000000..c34ba378c51b --- /dev/null +++ b/changelog.d/9333.bugfix @@ -0,0 +1 @@ +Fix additional errors when previewing URLs: "AttributeError 'NoneType' object has no attribute 'xpath'" and "ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.". diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index bf3be653aa6c..ae53b1d23ff7 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -58,7 +58,10 @@ logger = logging.getLogger(__name__) -_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I) +_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I) +_xml_encoding_match = re.compile( + br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I +) _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) OG_TAG_NAME_MAXLEN = 50 @@ -300,24 +303,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes: with open(media_info["filename"], "rb") as file: body = file.read() - encoding = None - - # Let's try and figure out if it has an encoding set in a meta tag. - # Limit it to the first 1kb, since it ought to be in the meta tags - # at the top. - match = _charset_match.search(body[:1000]) - - # If we find a match, it should take precedence over the - # Content-Type header, so set it here. - if match: - encoding = match.group(1).decode("ascii") - - # If we don't find a match, we'll look at the HTTP Content-Type, and - # if that doesn't exist, we'll fall back to UTF-8. - if not encoding: - content_match = _content_type_match.match(media_info["media_type"]) - encoding = content_match.group(1) if content_match else "utf-8" - + encoding = get_html_media_encoding(body, media_info["media_type"]) og = decode_and_calc_og(body, media_info["uri"], encoding) # pre-cache the image for posterity @@ -689,6 +675,48 @@ async def _expire_url_cache_data(self) -> None: logger.debug("No media removed from url cache") +def get_html_media_encoding(body: bytes, content_type: str) -> str: + """ + Get the encoding of the body based on the (presumably) HTML body or media_type. + + The precedence used for finding a character encoding is: + + 1. meta tag with a charset declared. + 2. The XML document's character encoding attribute. + 3. The Content-Type header. + 4. Fallback to UTF-8. + + Args: + body: The HTML document, as bytes. + content_type: The Content-Type header. + + Returns: + The character encoding of the body, as a string. + """ + # Limit searches to the first 1kb, since it ought to be at the top. + body_start = body[:1024] + + # Let's try and figure out if it has an encoding set in a meta tag. + match = _charset_match.search(body_start) + if match: + return match.group(1).decode("ascii") + + # TODO Support + + # If we didn't find a match, see if it an XML document with an encoding. + match = _xml_encoding_match.match(body_start) + if match: + return match.group(1).decode("ascii") + + # If we don't find a match, we'll look at the HTTP Content-Type, and + # if that doesn't exist, we'll fall back to UTF-8. + content_match = _content_type_match.match(content_type) + if content_match: + return content_match.group(1) + + return "utf-8" + + def decode_and_calc_og( body: bytes, media_uri: str, request_encoding: Optional[str] = None ) -> Dict[str, Optional[str]]: @@ -725,6 +753,11 @@ def decode_and_calc_og( def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]: # Attempt to parse the body. If this fails, log and return no metadata. tree = etree.fromstring(body_attempt, parser) + + # The data was successfully parsed, but no tree was found. + if tree is None: + return {} + return _calc_og(tree, media_uri) # Attempt to parse the body. If this fails, log and return no metadata. diff --git a/tests/test_preview.py b/tests/test_preview.py index 0c6cbbd92148..ea8329991816 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -15,6 +15,7 @@ from synapse.rest.media.v1.preview_url_resource import ( decode_and_calc_og, + get_html_media_encoding, summarize_paragraphs, ) @@ -26,7 +27,7 @@ lxml = None -class PreviewTestCase(unittest.TestCase): +class SummarizeTestCase(unittest.TestCase): if not lxml: skip = "url preview feature requires lxml" @@ -144,12 +145,12 @@ def test_small_then_large_summarize(self): ) -class PreviewUrlTestCase(unittest.TestCase): +class CalcOgTestCase(unittest.TestCase): if not lxml: skip = "url preview feature requires lxml" def test_simple(self): - html = """ + html = b"""