From f3594ea6beabd3a452085b6cd7363a44a475be2a Mon Sep 17 00:00:00 2001
From: James Bennett <james@b-list.org>
Date: Mon, 18 Jun 2012 12:02:23 -0500
Subject: [PATCH] Bug 747403 -- refine section ID generation

This is a first step, namely matching MindTouch behavior for sections
whose names contain non-ASCII characters. We now generate IDs in a
similar way: any section name which contains only ASCII content merely
has spaces replaced with underscores. A name which contains non-ASCII
characters has each such character replaced by the hexadecimal digits
of the byte(s) of its UTF-8 encoding, with each byte's pair of digits
preceded by a dot (for example, "à" becomes ".C3.A0").

The test cases are a sampling of non-ASCII and mixed-character-set
section names and the slugs MindTouch generates for them.

This does not guarantee absolute parity with MindTouch, but probably
gets us close enough. It also does not deal with the problem of a
document in which not all section names are unique, but so far as I
can tell this does not introduce any new problems, merely perpetuates
an old one, assuming any such documents exist.
---
 apps/wiki/content.py            | 23 ++++++++++++++++++++++-
 apps/wiki/tests/test_content.py | 28 +++++++++++++++++++++++++++-
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/apps/wiki/content.py b/apps/wiki/content.py
index f1197f61d6b..323fd76c5cb 100644
--- a/apps/wiki/content.py
+++ b/apps/wiki/content.py
@@ -118,7 +118,28 @@ def gen_id(self):
 
     def slugify(self, text):
         """Turn the text content of a header into a slug for use in an ID"""
-        return (text.replace(' ', '_'))
+        # ASCII ends at 127, so every character from U+0080 up is encoded.
+        non_ascii = set(c for c in text if ord(c) > 127)
+        for c in non_ascii:
+            text = text.replace(c, self.encode_non_ascii(c))
+        text = text.replace(' ', '_')
+        return text
+
+    def encode_non_ascii(self, c):
+        # This is slightly gnarly.
+        #
+        # What MindTouch does is basically turn any non-ASCII character into
+        # the hex bytes of its UTF-8 encoding, each byte preceded by a dot.
+        #
+        # This is somewhat tricky in Python because Python's internals are
+        # UCS-2, meaning that Python will give us, essentially, UTF-16 code
+        # units out of Unicode strings. So, an ugly but functional hack:
+        # encode the offending character as UTF-8 and repr that, which gives
+        # us each byte as a '\x' escape plus two hex digits. Then we strip
+        # the quotes repr adds, replace each '\x' with a dot, uppercase the
+        # result, and we have the thing MindTouch would generate.
+        return repr(c.encode('utf-8')).strip("'").replace(r'\x', '.').upper()
+
 
     def __iter__(self):
         input = html5lib_Filter.__iter__(self)
diff --git a/apps/wiki/tests/test_content.py b/apps/wiki/tests/test_content.py
index 783745c3535..4bfed37289f 100644
--- a/apps/wiki/tests/test_content.py
+++ b/apps/wiki/tests/test_content.py
@@ -10,7 +10,7 @@
 from sumo.tests import TestCase
 import wiki.content
 from wiki.content import (CodeSyntaxFilter, DekiscriptMacroFilter,
-                          SectionTOCFilter, SECTION_TAGS)
+                          SectionTOCFilter, SectionIDFilter, SECTION_TAGS)
 from wiki.models import ALLOWED_TAGS, ALLOWED_ATTRIBUTES
 from wiki.tests import normalize_html
 
@@ -338,6 +338,32 @@ def test_code_syntax_conversion(self):
                   .filter(CodeSyntaxFilter).serialize())
         eq_(normalize_html(expected), normalize_html(result))
 
+    def test_non_ascii_section_headers(self):
+        # Each pair is (section header text, ID slug expected from slugify).
+        headers = [
+            (u'Documentation à propos de HTML',
+             'Documentation_.C3.A0_propos_de_HTML'),
+            (u'Outils facilitant le développement HTML',
+             'Outils_facilitant_le_d.C3.A9veloppement_HTML'),
+            (u'例:\u00a0スキューと平行移動',
+             '.E4.BE.8B:.C2.A0.E3.82.B9.E3.82.AD.E3.83.A5.E3.83.BC.E3.81.A8.E5.B9.B3.E8.A1.8C.E7.A7.BB.E5.8B.95'),
+            (u'例:\u00a0回転',
+             '.E4.BE.8B:.C2.A0.E5.9B.9E.E8.BB.A2'),
+            (u'Documentação',
+             'Documenta.C3.A7.C3.A3o'),
+            (u'Lektury uzupełniające',
+             'Lektury_uzupe.C5.82niaj.C4.85ce'),
+            (u'Атрибуты',
+             '.D0.90.D1.82.D1.80.D0.B8.D0.B1.D1.83.D1.82.D1.8B'),
+            (u'HTML5 엘리먼트',
+             'HTML5_.EC.97.98.EB.A6.AC.EB.A8.BC.ED.8A.B8'),
+        ]
+
+        section_filter = SectionIDFilter('')
+        for original, slugified in headers:
+            eq_(slugified, section_filter.slugify(original))
+
+
     @attr('toc')
     def test_generate_toc(self):
         doc_src = """