Skip to content

Commit

Permalink
Merge pull request #25 from taku0/fix_clipboard_handling_html_with_mu…
Browse files Browse the repository at this point in the history
…ltibyte_characters

Fix clipboard handling HTML with multibyte characters
  • Loading branch information
jaraco authored Sep 21, 2024
2 parents 65cacd7 + 627f8a4 commit fe92782
Show file tree
Hide file tree
Showing 3 changed files with 89 additions and 13 deletions.
61 changes: 48 additions & 13 deletions jaraco/windows/clipboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,24 +82,60 @@ class HTMLSnippet(object):
<https://docs.microsoft.com/en-us/windows/win32/dataxchg/html-clipboard-format>`_.
"""

# The non-normative grammar defines markers with a space before "-->",
# but the body text defines them without spaces and states "with no
# whitespace chars within each comment itself".
# In practice, Edge and Firefox emit the markers without a space, so we
# omit the space as well.
START_FRAGMENT_MARKER = b'<!--StartFragment-->'
END_FRAGMENT_MARKER = b'<!--EndFragment-->'

def __init__(self, handle):
self.data = nts(raw_data(handle).decode('utf-8'))
self.headers = self.parse_headers(self.data)

@property
def html(self):
return self.data[self.headers['StartHTML'] :]
start_html = self.headers['StartHTML']
end_html = self.headers['EndHTML']
if start_html == -1:
start_html = self.headers['StartFragment'] - len(
HTMLSnippet.START_FRAGMENT_MARKER
)
if end_html == -1:
end_html = self.headers['EndFragment'] + len(
HTMLSnippet.END_FRAGMENT_MARKER
)
return self.__slice_data(start_html, end_html)

@property
def fragment(self):
return self.data[self.headers['StartFragment'] : self.headers['EndFragment']]
start_fragment = self.headers['StartFragment']
end_fragment = self.headers['EndFragment']
return self.__slice_data(start_fragment, end_fragment)

@property
def selection(self):
start_selection = (
self.headers.get('StartSelection') or self.headers['StartFragment']
)
end_selection = self.headers.get('EndSelection') or self.headers['EndFragment']
return self.__slice_data(start_selection, end_selection)

def __slice_data(self, start, end):
return self.data.encode('utf-8')[start:end].decode('utf-8')

@staticmethod
def parse_headers(data):
d = io.StringIO(data)
d = io.StringIO(data, None)

if not re.match(r'Version:(?:0\.9|1\.0)\n', d.readline()):
raise ValueError('Unsupported format')

d.seek(0)

def header_line(line):
return re.match(r'(\w+):(.*)', line)
return re.match(r'(\w+):(.*)\n', line)

headers = map(header_line, d)
# grab headers until they no longer match
Expand Down Expand Up @@ -133,20 +169,19 @@ def from_string(cls, source):
StartFragment:{start_fragment:08d}
EndFragment:{end_fragment:08d}
<html><body>
<!--StartFragment -->
{source}
<!--EndFragment -->
<!--StartFragment-->{source}<!--EndFragment-->
</body></html>
"""
).strip()
zeros = collections.defaultdict(lambda: 0, locals())
pre_value = tmpl.format_map(zeros)
start_html = pre_value.find('<html>')
end_html = len(tmpl)
pre_value = tmpl.format_map(zeros).encode('utf-8')
start_html = pre_value.find(b'<html>')
end_html = len(pre_value)
assert end_html < 100000000
start_fragment = pre_value.find(source)
end_fragment = pre_value.rfind('\n<!--EndFragment')
tmpl_length = len(tmpl) - len('{source}')
start_fragment = pre_value.find(HTMLSnippet.START_FRAGMENT_MARKER) + len(
HTMLSnippet.START_FRAGMENT_MARKER
)
end_fragment = pre_value.rfind(HTMLSnippet.END_FRAGMENT_MARKER)
snippet = cls.__new__(cls)
snippet.data = tmpl.format_map(locals())
snippet.headers = cls.parse_headers(snippet.data)
Expand Down
1 change: 1 addition & 0 deletions newsfragments/24.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed clipboard handling of HTML with multibyte characters.
40 changes: 40 additions & 0 deletions tests/test_clipboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def test_unicode_clipboard():
@pytest.fixture
def sample_html():
with wc.context():
wc.EmptyClipboard()
wc.SetClipboardData(api.CF_HTML, example_html.encode('utf-8'))


Expand All @@ -68,3 +69,42 @@ def test_html_paste(sample_html):
def test_html_fragment(sample_html):
snippet = wc.get_html()
assert snippet.fragment == '<LI> The Fragment </LI>'


multibyte_example_html = textwrap.dedent(
"""
Version:0.9
StartHTML:00000138
EndHTML:00000215
StartFragment:00000171
EndFragment:00000182
StartSelection:00000174
EndSelection:00000178
<html><body>
<!--StartFragment--><p>😀</p><!--EndFragment-->
</body></html>
"""
).strip()


@pytest.fixture
def multibyte_sample_html():
with wc.context():
wc.EmptyClipboard()
wc.SetClipboardData(api.CF_HTML, multibyte_example_html.encode('utf-8'))


def test_html_multibyte_characters(multibyte_sample_html):
res = wc.get_html()
assert (
res.html
== textwrap.dedent(
"""
<html><body>
<!--StartFragment--><p>😀</p><!--EndFragment-->
</body></html>
"""
).strip()
)
assert res.fragment == '<p>😀</p>'
assert res.selection == '😀'

0 comments on commit fe92782

Please sign in to comment.