Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Attempt different character encodings when previewing a URL. #11077

Merged
merged 3 commits into from
Oct 14, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/11077.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a long-standing bug when attempting to preview URLs which are in the `windows-1252` character encoding.
80 changes: 39 additions & 41 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
with open(media_info.filename, "rb") as file:
body = file.read()

encoding = get_html_media_encoding(body, media_info.media_type)
tree = decode_body(body, encoding)
tree = decode_body(body, media_info.uri, media_info.media_type)
if tree is not None:
# Check if this HTML document points to oEmbed information and
# defer to that.
Expand Down Expand Up @@ -632,16 +631,19 @@ def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
logger.debug("No media removed from url cache")


def get_html_media_encoding(body: bytes, content_type: str) -> str:
def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterable[str]:
"""
Get the encoding of the body based on the (presumably) HTML body or media_type.
Get potential encoding of the body based on the (presumably) HTML body or the content-type header.

The precedence used for finding a character encoding is:

1. meta tag with a charset declared.
1. <meta> tag with a charset declared.
2. The XML document's character encoding attribute.
3. The Content-Type header.
4. Fallback to UTF-8.
4. Fallback to utf-8.
5. Fallback to windows-1252.

This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector.

Args:
body: The HTML document, as bytes.
Expand All @@ -653,36 +655,39 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
# Limit searches to the first 1kb, since it ought to be at the top.
body_start = body[:1024]

# Let's try and figure out if it has an encoding set in a meta tag.
# Check if it has an encoding set in a meta tag.
match = _charset_match.search(body_start)
if match:
return match.group(1).decode("ascii")
yield match.group(1).decode("ascii")

# TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>

# If we didn't find a match, see if it an XML document with an encoding.
# Check if it has an XML document with an encoding.
match = _xml_encoding_match.match(body_start)
if match:
return match.group(1).decode("ascii")
yield match.group(1).decode("ascii")

# If we don't find a match, we'll look at the HTTP Content-Type, and
# if that doesn't exist, we'll fall back to UTF-8.
content_match = _content_type_match.match(content_type)
if content_match:
return content_match.group(1)
# Check the HTTP Content-Type header for a character set.
if content_type:
content_match = _content_type_match.match(content_type)
if content_match:
yield content_match.group(1)

return "utf-8"
# Finally, fallback to UTF-8, then windows-1252.
yield "utf-8"
yield "windows-1252"
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: one easy thing to do to mitigate @squahtx 's concern would be to use a set to track which encodings this code has already yielded and then skip yielding that encoding a second time.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The BeautifulSoup code does this and...I meant to implement it and failed to remember to do that. I'll do a follow-up! Thanks!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, I didn't notice the set at all. Some of my comments might not have made sense since I thought there wasn't one.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #11089.



def decode_body(
body: bytes, request_encoding: Optional[str] = None
body: bytes, uri: str, content_type: Optional[str] = None
) -> Optional["etree.Element"]:
"""
This uses lxml to parse the HTML document.

Args:
body: The HTML document, as bytes.
request_encoding: The character encoding of the body, as a string.
uri: The URI used to download the body.
content_type: The Content-Type header.

Returns:
The parsed HTML body, or None if an error occurred during processed.
Expand All @@ -691,32 +696,25 @@ def decode_body(
if not body:
return None

for encoding in get_html_media_encodings(body, content_type):
try:
body_str = body.decode(encoding)
except Exception:
pass
else:
break
else:
logger.warning("Unable to decode HTML body for %s", uri)
return None
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a thought... would there be value in falling back to latin-1 (iso8859-1) after trying windows-1252? The advantage would be that all bytes will decode in latin-1 and so you will always get a resulting string.

The drawback is that if the content wasn't really encoded with latin-1, then you're just returning noise. (However, the same issue applies to using windows-1252 as a fallback, it's just that windows-1252 has five invalid bytes on which decoding could fail whereas latin-1 has zero invalid bytes)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That could help, I think worse case you'd get some garbage characters, but hopefully some of the preview would still be useful.


from lxml import etree

# Create an HTML parser. If this fails, log and return no metadata.
try:
parser = etree.HTMLParser(recover=True, encoding=request_encoding)
except LookupError:
# blindly consider the encoding as utf-8.
parser = etree.HTMLParser(recover=True, encoding="utf-8")
except Exception as e:
logger.warning("Unable to create HTML parser: %s" % (e,))
return None
# Create an HTML parser.
parser = etree.HTMLParser(recover=True, encoding="utf-8")

def _attempt_decode_body(
body_attempt: Union[bytes, str]
) -> Optional["etree.Element"]:
# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
return etree.fromstring(body_attempt, parser)

# Attempt to parse the body. If this fails, log and return no metadata.
try:
return _attempt_decode_body(body)
except UnicodeDecodeError:
# blindly try decoding the body as utf-8, which seems to fix
# the charset mismatches on https://google.com
return _attempt_decode_body(body.decode("utf-8", "ignore"))
# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
return etree.fromstring(body_str, parser)


def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
Expand Down
66 changes: 40 additions & 26 deletions tests/test_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from synapse.rest.media.v1.preview_url_resource import (
_calc_og,
decode_body,
get_html_media_encoding,
get_html_media_encodings,
summarize_paragraphs,
)

Expand Down Expand Up @@ -159,7 +159,7 @@ def test_simple(self):
</html>
"""

tree = decode_body(html)
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")

self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
Expand All @@ -175,7 +175,7 @@ def test_comment(self):
</html>
"""

tree = decode_body(html)
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")

self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
Expand All @@ -194,7 +194,7 @@ def test_comment2(self):
</html>
"""

tree = decode_body(html)
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")

self.assertEqual(
Expand All @@ -216,7 +216,7 @@ def test_script(self):
</html>
"""

tree = decode_body(html)
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")

self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
Expand All @@ -230,7 +230,7 @@ def test_missing_title(self):
</html>
"""

tree = decode_body(html)
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")

self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
Expand All @@ -245,7 +245,7 @@ def test_h1_as_title(self):
</html>
"""

tree = decode_body(html)
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")

self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
Expand All @@ -260,21 +260,21 @@ def test_missing_title_and_broken_h1(self):
</html>
"""

tree = decode_body(html)
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")

self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

def test_empty(self):
"""Test a body with no data in it."""
html = b""
tree = decode_body(html)
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)

def test_no_tree(self):
"""A valid body with no tree in it."""
html = b"\x00"
tree = decode_body(html)
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)

def test_invalid_encoding(self):
Expand All @@ -287,7 +287,7 @@ def test_invalid_encoding(self):
</body>
</html>
"""
tree = decode_body(html, "invalid-encoding")
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

Expand All @@ -302,15 +302,29 @@ def test_invalid_encoding2(self):
</body>
</html>
"""
tree = decode_body(html)
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})

def test_windows_1252(self):
"""A body which uses windows-1252, but doesn't declare that."""
html = b"""
<html>
<head><title>\xf3</title></head>
<body>
Some text.
</body>
</html>
"""
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})


class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset(self):
"""A character encoding is found via the meta tag."""
encoding = get_html_media_encoding(
encodings = get_html_media_encodings(
b"""
<html>
<head><meta charset="ascii">
Expand All @@ -319,10 +333,10 @@ def test_meta_charset(self):
""",
"text/html",
)
self.assertEqual(encoding, "ascii")
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])

# A less well-formed version.
encoding = get_html_media_encoding(
encodings = get_html_media_encodings(
b"""
<html>
<head>< meta charset = ascii>
Expand All @@ -331,11 +345,11 @@ def test_meta_charset(self):
""",
"text/html",
)
self.assertEqual(encoding, "ascii")
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])

def test_meta_charset_underscores(self):
"""A character encoding contains underscore."""
encoding = get_html_media_encoding(
encodings = get_html_media_encodings(
b"""
<html>
<head><meta charset="Shift_JIS">
Expand All @@ -344,23 +358,23 @@ def test_meta_charset_underscores(self):
""",
"text/html",
)
self.assertEqual(encoding, "Shift_JIS")
self.assertEqual(list(encodings), ["Shift_JIS", "utf-8", "windows-1252"])

def test_xml_encoding(self):
"""A character encoding is found via the meta tag."""
encoding = get_html_media_encoding(
encodings = get_html_media_encodings(
b"""
<?xml version="1.0" encoding="ascii"?>
<html>
</html>
""",
"text/html",
)
self.assertEqual(encoding, "ascii")
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])

def test_meta_xml_encoding(self):
"""Meta tags take precedence over XML encoding."""
encoding = get_html_media_encoding(
encodings = get_html_media_encodings(
b"""
<?xml version="1.0" encoding="ascii"?>
<html>
Expand All @@ -370,7 +384,7 @@ def test_meta_xml_encoding(self):
""",
"text/html",
)
self.assertEqual(encoding, "UTF-16")
self.assertEqual(list(encodings), ["UTF-16", "ascii", "utf-8", "windows-1252"])

def test_content_type(self):
"""A character encoding is found via the Content-Type header."""
Expand All @@ -384,10 +398,10 @@ def test_content_type(self):
'text/html; charset=ascii";',
)
for header in headers:
encoding = get_html_media_encoding(b"", header)
self.assertEqual(encoding, "ascii")
encodings = get_html_media_encodings(b"", header)
self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"])

def test_fallback(self):
"""A character encoding cannot be found in the body or header."""
encoding = get_html_media_encoding(b"", "text/html")
self.assertEqual(encoding, "utf-8")
encodings = get_html_media_encodings(b"", "text/html")
self.assertEqual(list(encodings), ["utf-8", "windows-1252"])