From f3594ea6beabd3a452085b6cd7363a44a475be2a Mon Sep 17 00:00:00 2001
From: James Bennett
Date: Mon, 18 Jun 2012 12:02:23 -0500
Subject: [PATCH] Bug 747403 -- refine section ID generation

This is a first step, namely matching MindTouch behavior for sections
whose names contain non-ASCII characters. We now generate IDs in a
similar way: any section name which contains only ASCII content merely
has spaces replaced with underscores. A name which contains non-ASCII
characters has each such character replaced by hexadecimal digits
representing the appropriate UTF-8 byte(s), with each set of digits
preceded by a dot.

The test cases are a sampling of non-ASCII and mixed-character-set
section names and the slugs MindTouch generates for them. This does
not guarantee absolute parity with MindTouch, but probably gets us
close enough.

It also does not deal with the problem of a document in which not all
section names are unique, but so far as I can tell this does not
introduce any new problems, merely perpetuates an old one, assuming
any such documents exist.
---
 apps/wiki/content.py            | 23 ++++++++++++++++++++++-
 apps/wiki/tests/test_content.py | 28 +++++++++++++++++++++++++++-
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/apps/wiki/content.py b/apps/wiki/content.py
index f1197f61d6b..323fd76c5cb 100644
--- a/apps/wiki/content.py
+++ b/apps/wiki/content.py
@@ -118,7 +118,28 @@ def gen_id(self):
 
     def slugify(self, text):
         """Turn the text content of a header into a slug for use in an ID"""
-        return (text.replace(' ', '_'))
+        non_ascii = [c for c in text if ord(c) > 127]
+        if non_ascii:
+            for c in non_ascii:
+                text = text.replace(c, self.encode_non_ascii(c))
+        text = text.replace(' ', '_')
+        return text
+
+    def encode_non_ascii(self, c):
+        # This is slightly gnarly.
+        #
+        # What MindTouch does is basically turn each non-ASCII character
+        # into the hex digits of its UTF-8 bytes, each preceded by a dot.
+        #
+        # ord() gives us a code point (or, on narrow Python 2 builds, a
+        # UTF-16 code unit), not UTF-8 bytes. So, an ugly but functional
+        # hack: encode the offending character as UTF-8 and repr that,
+        # which gives us each byte as a '\x' escape sequence. Then we
+        # just replace the escape sequence with the dot, uppercase it,
+        # and we have the thing MindTouch would generate.
+        # NOTE: this relies on Python 2's repr() of a byte string.
+        return repr(c.encode('utf-8')).strip("'").replace(r'\x', '.').upper()
+
 
     def __iter__(self):
         input = html5lib_Filter.__iter__(self)
diff --git a/apps/wiki/tests/test_content.py b/apps/wiki/tests/test_content.py
index 783745c3535..4bfed37289f 100644
--- a/apps/wiki/tests/test_content.py
+++ b/apps/wiki/tests/test_content.py
@@ -10,7 +10,7 @@ from sumo.tests import TestCase
 
 import wiki.content
 from wiki.content import (CodeSyntaxFilter, DekiscriptMacroFilter,
-                          SectionTOCFilter, SECTION_TAGS)
+                          SectionTOCFilter, SectionIDFilter, SECTION_TAGS)
 from wiki.models import ALLOWED_TAGS, ALLOWED_ATTRIBUTES
 from wiki.tests import normalize_html
 
@@ -338,6 +338,32 @@ def test_code_syntax_conversion(self):
                   .filter(CodeSyntaxFilter).serialize())
         eq_(normalize_html(expected), normalize_html(result))
 
+    def test_non_ascii_section_headers(self):
+        headers = [
+            (u'Documentation à propos de HTML',
+             'Documentation_.C3.A0_propos_de_HTML'),
+            (u'Outils facilitant le développement HTML',
+             'Outils_facilitant_le_d.C3.A9veloppement_HTML'),
+            (u'例:\u00a0スキューと平行移動',
+             '.E4.BE.8B:.C2.A0.E3.82.B9.E3.82.AD.E3.83.A5.E3.83.BC.E3.81.A8.E5.B9.B3.E8.A1.8C.E7.A7.BB.E5.8B.95'),
+            (u'例:\u00a0回転',
+             '.E4.BE.8B:.C2.A0.E5.9B.9E.E8.BB.A2'),
+            (u'Documentação',
+             'Documenta.C3.A7.C3.A3o'),
+            (u'Lektury uzupełniające',
+             'Lektury_uzupe.C5.82niaj.C4.85ce'),
+            (u'Атрибуты',
+             '.D0.90.D1.82.D1.80.D0.B8.D0.B1.D1.83.D1.82.D1.8B'),
+            (u'HTML5 엘리먼트',
+             'HTML5_.EC.97.98.EB.A6.AC.EB.A8.BC.ED.8A.B8'),
+        ]
+
+        section_filter = SectionIDFilter('')
+
+        for original, slugified in headers:
+            eq_(slugified, section_filter.slugify(original))
+
+
     @attr('toc')
     def test_generate_toc(self):
         doc_src = """