diff --git a/jaraco/windows/clipboard.py b/jaraco/windows/clipboard.py index 626c602..b71d94c 100644 --- a/jaraco/windows/clipboard.py +++ b/jaraco/windows/clipboard.py @@ -82,24 +82,60 @@ class HTMLSnippet(object): `_. """ + # The non-normative grammar defines markers with a space before "-->", + # but the body text defines them without spaces and states "with no + # whitespace chars within each comment itself". + # Actually, Edge and Firefox don't include spaces, so we also don't include + # a space. + START_FRAGMENT_MARKER = b'' + END_FRAGMENT_MARKER = b'' + def __init__(self, handle): self.data = nts(raw_data(handle).decode('utf-8')) self.headers = self.parse_headers(self.data) @property def html(self): - return self.data[self.headers['StartHTML'] :] + start_html = self.headers['StartHTML'] + end_html = self.headers['EndHTML'] + if start_html == -1: + start_html = self.headers['StartFragment'] - len( + HTMLSnippet.START_FRAGMENT_MARKER + ) + if end_html == -1: + end_html = self.headers['EndFragment'] + len( + HTMLSnippet.END_FRAGMENT_MARKER + ) + return self.__slice_data(start_html, end_html) @property def fragment(self): - return self.data[self.headers['StartFragment'] : self.headers['EndFragment']] + start_fragment = self.headers['StartFragment'] + end_fragment = self.headers['EndFragment'] + return self.__slice_data(start_fragment, end_fragment) + + @property + def selection(self): + start_selection = ( + self.headers.get('StartSelection') or self.headers['StartFragment'] + ) + end_selection = self.headers.get('EndSelection') or self.headers['EndFragment'] + return self.__slice_data(start_selection, end_selection) + + def __slice_data(self, start, end): + return self.data.encode('utf-8')[start:end].decode('utf-8') @staticmethod def parse_headers(data): - d = io.StringIO(data) + d = io.StringIO(data, None) + + if not re.match(r'Version:(?:0\.9|1\.0)\n', d.readline()): + raise ValueError('Unsupported format') + + d.seek(0) def header_line(line): - return re.match(r'(\w+):(.*)', line) + return re.match(r'(\w+):(.*)\n', line) headers = map(header_line, d) # grab headers until they no longer match @@ -133,20 +169,19 @@ def from_string(cls, source): StartFragment:{start_fragment:08d} EndFragment:{end_fragment:08d} - - {source} - + {source} """ ).strip() zeros = collections.defaultdict(lambda: 0, locals()) - pre_value = tmpl.format_map(zeros) - start_html = pre_value.find('') - end_html = len(tmpl) + pre_value = tmpl.format_map(zeros).encode('utf-8') + start_html = pre_value.find(b'') + end_html = len(pre_value) assert end_html < 100000000 - start_fragment = pre_value.find(source) - end_fragment = pre_value.rfind('\n

😀

+ + """ +).strip() + + +@pytest.fixture +def multibyte_sample_html(): + with wc.context(): + wc.EmptyClipboard() + wc.SetClipboardData(api.CF_HTML, multibyte_example_html.encode('utf-8')) + + +def test_html_multibyte_characters(multibyte_sample_html): + res = wc.get_html() + assert ( + res.html + == textwrap.dedent( + """ + +

😀

+ + """ + ).strip() + ) + assert res.fragment == '

😀

' + assert res.selection == '😀'