This repository has been archived by the owner on Aug 26, 2022. It is now read-only.

Bug 747403 -- refine section ID generation #283

Merged: 1 commit, merged on Jun 19, 2012
apps/wiki/content.py (22 additions, 1 deletion)
@@ -118,7 +118,28 @@ def gen_id(self):

    def slugify(self, text):
        """Turn the text content of a header into a slug for use in an ID"""
-        return (text.replace(' ', '_'))
+        non_ascii = [c for c in text if ord(c) > 128]
+        if non_ascii:
+            for c in non_ascii:
+                text = text.replace(c, self.encode_non_ascii(c))
+        text = text.replace(' ', '_')
+        return text

+    def encode_non_ascii(self, c):
+        # This is slightly gnarly.
+        #
+        # What MindTouch does is basically turn any non-ASCII character
+        # into its UTF-8 bytes, each preceded by a dot.
+        #
+        # This is somewhat tricky in Python because Python's internals are
+        # UCS-2, meaning that Python will give us, essentially, UTF-16
+        # code units out of Unicode strings. So, an ugly but functional
+        # hack: encode the offending character as UTF-8 and repr that, which
+        # gives us the bytes as '\x' escape sequences. Then we can just
+        # replace each escape sequence's '\x' with a dot, uppercase it,
+        # and we have the thing MindTouch would generate.
+        return repr(c.encode('utf-8')).strip("'").replace(r'\x', '.').upper()


    def __iter__(self):
        input = html5lib_Filter.__iter__(self)
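For reference, the dot-hex IDs that encode_non_ascii builds via the repr trick can also be produced directly from the character's UTF-8 bytes. Below is a minimal sketch of that equivalence; the helper name is hypothetical and it assumes only the encoding described in the comment above, where each UTF-8 byte of the character becomes an uppercase '.XX' pair:

    def encode_non_ascii_sketch(c):
        # Encode the character as UTF-8, then render each byte as ".XX"
        # (uppercase hex), e.g. u'\u00e0' -> '.C3.A0', u'\u4f8b' -> '.E4.BE.8B'.
        return ''.join('.%02X' % byte for byte in bytearray(c.encode('utf-8')))

Unlike the repr-based version, this sketch does not depend on how a particular Python version formats byte-string reprs (on Python 3, for example, repr of a bytes object gains a b'' prefix that the strip("'") call would not remove).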
apps/wiki/tests/test_content.py (27 additions, 1 deletion)
@@ -10,7 +10,7 @@
from sumo.tests import TestCase
import wiki.content
from wiki.content import (CodeSyntaxFilter, DekiscriptMacroFilter,
-                          SectionTOCFilter, SECTION_TAGS)
+                          SectionTOCFilter, SectionIDFilter, SECTION_TAGS)
from wiki.models import ALLOWED_TAGS, ALLOWED_ATTRIBUTES
from wiki.tests import normalize_html

@@ -338,6 +338,32 @@ def test_code_syntax_conversion(self):
                  .filter(CodeSyntaxFilter).serialize())
        eq_(normalize_html(expected), normalize_html(result))

+    def test_non_ascii_section_headers(self):
+        headers = [
+            (u'Documentation à propos de HTML',
+             'Documentation_.C3.A0_propos_de_HTML'),
+            (u'Outils facilitant le développement HTML',
+             'Outils_facilitant_le_d.C3.A9veloppement_HTML'),
+            (u'例:\u00a0スキューと平行移動',
+             '.E4.BE.8B:.C2.A0.E3.82.B9.E3.82.AD.E3.83.A5.E3.83.BC.E3.81.A8.E5.B9.B3.E8.A1.8C.E7.A7.BB.E5.8B.95'),
+            (u'例:\u00a0回転',
+             '.E4.BE.8B:.C2.A0.E5.9B.9E.E8.BB.A2'),
+            (u'Documentação',
+             'Documenta.C3.A7.C3.A3o'),
+            (u'Lektury uzupełniające',
+             'Lektury_uzupe.C5.82niaj.C4.85ce'),
+            (u'Атрибуты',
+             '.D0.90.D1.82.D1.80.D0.B8.D0.B1.D1.83.D1.82.D1.8B'),
+            (u'HTML5 엘리먼트',
+             'HTML5_.EC.97.98.EB.A6.AC.EB.A8.BC.ED.8A.B8'),
+        ]

+        section_filter = SectionIDFilter('')

+        for original, slugified in headers:
+            ok_(slugified == section_filter.slugify(original))


    @attr('toc')
    def test_generate_toc(self):
        doc_src = """