Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix URL preview errors when previewing XML documents #11196

Merged
merged 3 commits into from
Oct 27, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/11196.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug introduced in v1.46.0rc1 where URL previews of some XML documents would fail.
4 changes: 2 additions & 2 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,11 +732,11 @@ def decode_body(
from lxml import etree

# Create an HTML parser.
parser = etree.HTMLParser(recover=True, encoding="utf-8")
parser = etree.HTMLParser(recover=True, encoding=encoding)

# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
return etree.fromstring(body_str, parser)
return etree.fromstring(body, parser)


def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
Expand Down
15 changes: 15 additions & 0 deletions tests/test_preview.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,21 @@ def test_no_tree(self):
tree = decode_body(html, "http://example.com/test.html")
self.assertIsNone(tree)

def test_xml(self):
"""Test decoding XML and ensure it works properly."""
# Note that the strip() call is important to ensure the xml tag starts
# at the initial byte.
html = b"""
<?xml version="1.0" encoding="UTF-8"?>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><title>Foo</title></head><body>Some text.</body></html>
""".strip()
tree = decode_body(html, "http://example.com/test.html")
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

def test_invalid_encoding(self):
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
html = b"""
Expand Down