diff --git a/apps/wiki/content.py b/apps/wiki/content.py
index f1197f61d6b..323fd76c5cb 100644
--- a/apps/wiki/content.py
+++ b/apps/wiki/content.py
@@ -118,7 +118,28 @@ def gen_id(self):
 
     def slugify(self, text):
         """Turn the text content of a header into a slug for use in an ID"""
-        return (text.replace(' ', '_'))
+        # Escape everything outside the 7-bit ASCII range (0-127); U+0080
+        # is already non-ASCII, so the comparison must not skip it.
+        # NOTE(review): on narrow (UCS-2) Python 2 builds a character
+        # outside the BMP is iterated as two surrogates and would be
+        # escaped half by half -- confirm whether such headers can occur.
+        non_ascii = set(c for c in text if ord(c) > 127)
+        for c in non_ascii:
+            text = text.replace(c, self.encode_non_ascii(c))
+        return text.replace(' ', '_')
+
+    def encode_non_ascii(self, c):
+        """Encode one character the way MindTouch does in section IDs.
+
+        MindTouch writes each byte of the character's UTF-8 encoding as
+        two uppercase hex digits preceded by a dot, so U+00E0 becomes
+        '.C3.A0'.
+        """
+        # bytearray() yields integers on both Python 2 and Python 3, so
+        # the bytes can be formatted directly rather than by rewriting
+        # the interpreter-specific output of repr().
+        return ''.join('.%02X' % b for b in bytearray(c.encode('utf-8')))
+
 
     def __iter__(self):
         input = html5lib_Filter.__iter__(self)
diff --git a/apps/wiki/tests/test_content.py b/apps/wiki/tests/test_content.py
index 783745c3535..4bfed37289f 100644
--- a/apps/wiki/tests/test_content.py
+++ b/apps/wiki/tests/test_content.py
@@ -10,7 +10,7 @@
 from sumo.tests import TestCase
 import wiki.content
 from wiki.content import (CodeSyntaxFilter, DekiscriptMacroFilter,
-                          SectionTOCFilter, SECTION_TAGS)
+                          SectionTOCFilter, SectionIDFilter, SECTION_TAGS)
 from wiki.models import ALLOWED_TAGS, ALLOWED_ATTRIBUTES
 from wiki.tests import normalize_html
 
@@ -338,6 +338,35 @@ def test_code_syntax_conversion(self):
                   .filter(CodeSyntaxFilter).serialize())
         eq_(normalize_html(expected), normalize_html(result))
 
+    def test_non_ascii_section_headers(self):
+        headers = [
+            (u'Documentation à propos de HTML',
+             'Documentation_.C3.A0_propos_de_HTML'),
+            (u'Outils facilitant le développement HTML',
+             'Outils_facilitant_le_d.C3.A9veloppement_HTML'),
+            (u'例:\u00a0スキューと平行移動',
+             '.E4.BE.8B:.C2.A0.E3.82.B9.E3.82.AD.E3.83.A5.E3.83.BC.E3.81.A8.E5.B9.B3.E8.A1.8C.E7.A7.BB.E5.8B.95'),
+            (u'例:\u00a0回転',
+             '.E4.BE.8B:.C2.A0.E5.9B.9E.E8.BB.A2'),
+            (u'Documentação',
+             'Documenta.C3.A7.C3.A3o'),
+            (u'Lektury uzupełniające',
+             'Lektury_uzupe.C5.82niaj.C4.85ce'),
+            (u'Атрибуты',
+             '.D0.90.D1.82.D1.80.D0.B8.D0.B1.D1.83.D1.82.D1.8B'),
+            (u'HTML5 엘리먼트',
+             'HTML5_.EC.97.98.EB.A6.AC.EB.A8.BC.ED.8A.B8'),
+            # U+0080 is the first code point outside ASCII.
+            (u'Section\u0080end',
+             'Section.C2.80end'),
+        ]
+
+        section_filter = SectionIDFilter('')
+
+        for original, slugified in headers:
+            eq_(slugified, section_filter.slugify(original))
+
+
     @attr('toc')
     def test_generate_toc(self):
         doc_src = """