Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Handle additional errors when previewing URLs #9333

Merged
merged 5 commits into from
Feb 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/9333.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix additional errors when previewing URLs: "AttributeError 'NoneType' object has no attribute 'xpath'" and "ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.".
71 changes: 52 additions & 19 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,10 @@

logger = logging.getLogger(__name__)

_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
_xml_encoding_match = re.compile(
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)

OG_TAG_NAME_MAXLEN = 50
Expand Down Expand Up @@ -300,24 +303,7 @@ async def _do_preview(self, url: str, user: str, ts: int) -> bytes:
with open(media_info["filename"], "rb") as file:
body = file.read()

encoding = None

# Let's try and figure out if it has an encoding set in a meta tag.
# Limit it to the first 1kb, since it ought to be in the meta tags
# at the top.
match = _charset_match.search(body[:1000])

# If we find a match, it should take precedence over the
# Content-Type header, so set it here.
if match:
encoding = match.group(1).decode("ascii")

# If we don't find a match, we'll look at the HTTP Content-Type, and
# if that doesn't exist, we'll fall back to UTF-8.
if not encoding:
content_match = _content_type_match.match(media_info["media_type"])
encoding = content_match.group(1) if content_match else "utf-8"

encoding = get_html_media_encoding(body, media_info["media_type"])
og = decode_and_calc_og(body, media_info["uri"], encoding)

# pre-cache the image for posterity
Expand Down Expand Up @@ -689,6 +675,48 @@ async def _expire_url_cache_data(self) -> None:
logger.debug("No media removed from url cache")


def get_html_media_encoding(body: bytes, content_type: str) -> str:
"""
Get the encoding of the body based on the (presumably) HTML body or media_type.

The precedence used for finding a character encoding is:

1. meta tag with a charset declared.
2. The XML document's character encoding attribute.
3. The Content-Type header.
4. Fallback to UTF-8.

Args:
body: The HTML document, as bytes.
content_type: The Content-Type header.

Returns:
The character encoding of the body, as a string.
"""
# Limit searches to the first 1kb, since it ought to be at the top.
body_start = body[:1024]

# Let's try and figure out if it has an encoding set in a meta tag.
match = _charset_match.search(body_start)
if match:
return match.group(1).decode("ascii")

# TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>

# If we didn't find a match, see if it an XML document with an encoding.
match = _xml_encoding_match.match(body_start)
if match:
return match.group(1).decode("ascii")

# If we don't find a match, we'll look at the HTTP Content-Type, and
# if that doesn't exist, we'll fall back to UTF-8.
content_match = _content_type_match.match(content_type)
if content_match:
return content_match.group(1)

return "utf-8"


def decode_and_calc_og(
body: bytes, media_uri: str, request_encoding: Optional[str] = None
) -> Dict[str, Optional[str]]:
Expand Down Expand Up @@ -725,6 +753,11 @@ def decode_and_calc_og(
def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(body_attempt, parser)

# The data was successfully parsed, but no tree was found.
if tree is None:
return {}

return _calc_og(tree, media_uri)

# Attempt to parse the body. If this fails, log and return no metadata.
Expand Down
103 changes: 92 additions & 11 deletions tests/test_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from synapse.rest.media.v1.preview_url_resource import (
decode_and_calc_og,
get_html_media_encoding,
summarize_paragraphs,
)

Expand All @@ -26,7 +27,7 @@
lxml = None


class PreviewTestCase(unittest.TestCase):
class SummarizeTestCase(unittest.TestCase):
if not lxml:
skip = "url preview feature requires lxml"

Expand Down Expand Up @@ -144,12 +145,12 @@ def test_small_then_large_summarize(self):
)


class PreviewUrlTestCase(unittest.TestCase):
class CalcOgTestCase(unittest.TestCase):
if not lxml:
skip = "url preview feature requires lxml"

def test_simple(self):
html = """
html = b"""
<html>
<head><title>Foo</title></head>
<body>
Expand All @@ -163,7 +164,7 @@ def test_simple(self):
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

def test_comment(self):
html = """
html = b"""
<html>
<head><title>Foo</title></head>
<body>
Expand All @@ -178,7 +179,7 @@ def test_comment(self):
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

def test_comment2(self):
html = """
html = b"""
<html>
<head><title>Foo</title></head>
<body>
Expand All @@ -202,7 +203,7 @@ def test_comment2(self):
)

def test_script(self):
html = """
html = b"""
<html>
<head><title>Foo</title></head>
<body>
Expand All @@ -217,7 +218,7 @@ def test_script(self):
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

def test_missing_title(self):
html = """
html = b"""
<html>
<body>
Some text.
Expand All @@ -230,7 +231,7 @@ def test_missing_title(self):
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

def test_h1_as_title(self):
html = """
html = b"""
<html>
<meta property="og:description" content="Some text."/>
<body>
Expand All @@ -244,7 +245,7 @@ def test_h1_as_title(self):
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})

def test_missing_title_and_broken_h1(self):
html = """
html = b"""
<html>
<body>
<h1><a href="foo"/></h1>
Expand All @@ -258,13 +259,20 @@ def test_missing_title_and_broken_h1(self):
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

def test_empty(self):
html = ""
"""Test a body with no data in it."""
html = b""
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {})

def test_no_tree(self):
"""A valid body with no tree in it."""
html = b"\x00"
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {})

def test_invalid_encoding(self):
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
html = """
html = b"""
<html>
<head><title>Foo</title></head>
<body>
Expand All @@ -290,3 +298,76 @@ def test_invalid_encoding2(self):
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})


class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset(self):
"""A character encoding is found via the meta tag."""
encoding = get_html_media_encoding(
b"""
<html>
<head><meta charset="ascii">
</head>
</html>
""",
"text/html",
)
self.assertEqual(encoding, "ascii")

# A less well-formed version.
encoding = get_html_media_encoding(
b"""
<html>
<head>< meta charset = ascii>
</head>
</html>
""",
"text/html",
)
self.assertEqual(encoding, "ascii")

def test_xml_encoding(self):
"""A character encoding is found via the meta tag."""
encoding = get_html_media_encoding(
b"""
<?xml version="1.0" encoding="ascii"?>
<html>
</html>
""",
"text/html",
)
self.assertEqual(encoding, "ascii")

def test_meta_xml_encoding(self):
"""Meta tags take precedence over XML encoding."""
encoding = get_html_media_encoding(
b"""
<?xml version="1.0" encoding="ascii"?>
<html>
<head><meta charset="UTF-16">
</head>
</html>
""",
"text/html",
)
self.assertEqual(encoding, "UTF-16")

def test_content_type(self):
"""A character encoding is found via the Content-Type header."""
# Test a few variations of the header.
headers = (
'text/html; charset="ascii";',
"text/html;charset=ascii;",
'text/html; charset="ascii"',
"text/html; charset=ascii",
'text/html; charset="ascii;',
'text/html; charset=ascii";',
)
for header in headers:
encoding = get_html_media_encoding(b"", header)
self.assertEqual(encoding, "ascii")

def test_fallback(self):
"""A character encoding cannot be found in the body or header."""
encoding = get_html_media_encoding(b"", "text/html")
self.assertEqual(encoding, "utf-8")