Merge pull request #25 from taku0/fix_clipboard_handling_html_with_multibyte_characters

Fix clipboard handling HTML with multibyte characters
jaraco · Sep 21, 2024 · fe92782 · fe92782
2 parents 65cacd7 + 627f8a4
commit fe92782
Show file tree

Hide file tree

Showing 3 changed files with 89 additions and 13 deletions.
diff --git a/jaraco/windows/clipboard.py b/jaraco/windows/clipboard.py
@@ -82,24 +82,60 @@ class HTMLSnippet(object):
     <https://docs.microsoft.com/en-us/windows/win32/dataxchg/html-clipboard-format>`_.
     """
 
+    # The non-normative grammar defines markers with a space before "-->",
+    # but the body text defines them without spaces and states "with no
+    # whitespace chars within each comment itself".
+    # In practice, Edge and Firefox emit the markers without a space, so we
+    # omit the space as well.
+    START_FRAGMENT_MARKER = b'<!--StartFragment-->'
+    END_FRAGMENT_MARKER = b'<!--EndFragment-->'
+
     def __init__(self, handle):
         self.data = nts(raw_data(handle).decode('utf-8'))
         self.headers = self.parse_headers(self.data)
 
     @property
     def html(self):
-        return self.data[self.headers['StartHTML'] :]
+        start_html = self.headers['StartHTML']
+        end_html = self.headers['EndHTML']
+        if start_html == -1:
+            start_html = self.headers['StartFragment'] - len(
+                HTMLSnippet.START_FRAGMENT_MARKER
+            )
+        if end_html == -1:
+            end_html = self.headers['EndFragment'] + len(
+                HTMLSnippet.END_FRAGMENT_MARKER
+            )
+        return self.__slice_data(start_html, end_html)
 
     @property
     def fragment(self):
-        return self.data[self.headers['StartFragment'] : self.headers['EndFragment']]
+        start_fragment = self.headers['StartFragment']
+        end_fragment = self.headers['EndFragment']
+        return self.__slice_data(start_fragment, end_fragment)
+
+    @property
+    def selection(self):
+        start_selection = (
+            self.headers.get('StartSelection') or self.headers['StartFragment']
+        )
+        end_selection = self.headers.get('EndSelection') or self.headers['EndFragment']
+        return self.__slice_data(start_selection, end_selection)
+
+    def __slice_data(self, start, end):
+        return self.data.encode('utf-8')[start:end].decode('utf-8')
 
     @staticmethod
     def parse_headers(data):
-        d = io.StringIO(data)
+        d = io.StringIO(data, None)
+
+        if not re.match(r'Version:(?:0\.9|1\.0)\n', d.readline()):
+            raise ValueError('Unsupported format')
+
+        d.seek(0)
 
         def header_line(line):
-            return re.match(r'(\w+):(.*)', line)
+            return re.match(r'(\w+):(.*)\n', line)
 
         headers = map(header_line, d)
         # grab headers until they no longer match
@@ -133,20 +169,19 @@ def from_string(cls, source):
             StartFragment:{start_fragment:08d}
             EndFragment:{end_fragment:08d}
             <html><body>
-            <!--StartFragment -->
-            {source}
-            <!--EndFragment -->
+            <!--StartFragment-->{source}<!--EndFragment-->
             </body></html>
             """
         ).strip()
         zeros = collections.defaultdict(lambda: 0, locals())
-        pre_value = tmpl.format_map(zeros)
-        start_html = pre_value.find('<html>')
-        end_html = len(tmpl)
+        pre_value = tmpl.format_map(zeros).encode('utf-8')
+        start_html = pre_value.find(b'<html>')
+        end_html = len(pre_value)
         assert end_html < 100000000
-        start_fragment = pre_value.find(source)
-        end_fragment = pre_value.rfind('\n<!--EndFragment')
-        tmpl_length = len(tmpl) - len('{source}')
+        start_fragment = pre_value.find(HTMLSnippet.START_FRAGMENT_MARKER) + len(
+            HTMLSnippet.START_FRAGMENT_MARKER
+        )
+        end_fragment = pre_value.rfind(HTMLSnippet.END_FRAGMENT_MARKER)
         snippet = cls.__new__(cls)
         snippet.data = tmpl.format_map(locals())
         snippet.headers = cls.parse_headers(snippet.data)

diff --git a/newsfragments/24.bugfix.rst b/newsfragments/24.bugfix.rst
@@ -0,0 +1 @@
+Fixed clipboard handling HTML with multibyte characters.
diff --git a/tests/test_clipboard.py b/tests/test_clipboard.py
@@ -47,6 +47,7 @@ def test_unicode_clipboard():
 @pytest.fixture
 def sample_html():
     with wc.context():
+        wc.EmptyClipboard()
         wc.SetClipboardData(api.CF_HTML, example_html.encode('utf-8'))
 
 
@@ -68,3 +69,42 @@ def test_html_paste(sample_html):
 def test_html_fragment(sample_html):
     snippet = wc.get_html()
     assert snippet.fragment == '<LI> The Fragment </LI>'
+
+
+multibyte_example_html = textwrap.dedent(
+    """
+    Version:0.9
+    StartHTML:00000138
+    EndHTML:00000215
+    StartFragment:00000171
+    EndFragment:00000182
+    StartSelection:00000174
+    EndSelection:00000178
+    <html><body>
+    <!--StartFragment--><p>😀</p><!--EndFragment-->
+    </body></html>
+    """
+).strip()
+
+
+@pytest.fixture
+def multibyte_sample_html():
+    with wc.context():
+        wc.EmptyClipboard()
+        wc.SetClipboardData(api.CF_HTML, multibyte_example_html.encode('utf-8'))
+
+
+def test_html_multibyte_characters(multibyte_sample_html):
+    res = wc.get_html()
+    assert (
+        res.html
+        == textwrap.dedent(
+            """
+            <html><body>
+            <!--StartFragment--><p>😀</p><!--EndFragment-->
+            </body></html>
+            """
+        ).strip()
+    )
+    assert res.fragment == '<p>😀</p>'
+    assert res.selection == '😀'
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Fixed clipboard handling HTML with multibyte characters.