This repository has been archived by the owner on Aug 26, 2022. It is now read-only.

Bug 747403 -- refine section ID generation #283

Merged: 1 commit, merged on Jun 19, 2012
apps/wiki/content.py (22 additions, 1 deletion)
@@ -118,7 +118,28 @@ def gen_id(self):

    def slugify(self, text):
        """Turn the text content of a header into a slug for use in an ID"""
-        return (text.replace(' ', '_'))
+        non_ascii = [c for c in text if ord(c) > 128]
+        if non_ascii:
+            for c in non_ascii:
+                text = text.replace(c, self.encode_non_ascii(c))
+        text = text.replace(' ', '_')
+        return text

+    def encode_non_ascii(self, c):
+        # This is slightly gnarly.
+        #
+        # What MindTouch does is basically turn any non-ASCII character
+        # into its UTF-8 bytes, each preceded by a dot.
+        #
+        # This is somewhat tricky in Python because Python's internals are
+        # UCS-2, meaning that Python will give us, essentially, UTF-16
+        # code units out of Unicode strings. So, an ugly but functional
+        # hack: encode the offending character as UTF-8 and repr that, which
+        # gives us the bytes as '\x' escape sequences. Then we can just
+        # replace each escape sequence's '\x' with a dot, uppercase it,
+        # and we have the thing MindTouch would generate.
+        return repr(c.encode('utf-8')).strip("'").replace(r'\x', '.').upper()


    def __iter__(self):
        input = html5lib_Filter.__iter__(self)
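For reference, the dot-hex IDs that encode_non_ascii builds via the repr trick can also be produced directly from the character's UTF-8 bytes. Below is a minimal sketch of that equivalence; the helper name is hypothetical and it assumes only the encoding described in the comment above, where each UTF-8 byte of the character becomes an uppercase '.XX' pair:

    def encode_non_ascii_sketch(c):
        # Encode the character as UTF-8, then render each byte as ".XX"
        # (uppercase hex), e.g. u'\u00e0' -> '.C3.A0', u'\u4f8b' -> '.E4.BE.8B'.
        return ''.join('.%02X' % byte for byte in bytearray(c.encode('utf-8')))

Unlike the repr-based version, this sketch does not depend on how a particular Python version formats byte-string reprs (on Python 3, for example, repr of a bytes object gains a b'' prefix that the strip("'") call would not remove).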
apps/wiki/tests/test_content.py (27 additions, 1 deletion)
@@ -10,7 +10,7 @@
from sumo.tests import TestCase
import wiki.content
from wiki.content import (CodeSyntaxFilter, DekiscriptMacroFilter,
-                          SectionTOCFilter, SECTION_TAGS)
+                          SectionTOCFilter, SectionIDFilter, SECTION_TAGS)
from wiki.models import ALLOWED_TAGS, ALLOWED_ATTRIBUTES
from wiki.tests import normalize_html

@@ -338,6 +338,32 @@ def test_code_syntax_conversion(self):
                  .filter(CodeSyntaxFilter).serialize())
        eq_(normalize_html(expected), normalize_html(result))

+    def test_non_ascii_section_headers(self):
+        headers = [
+            (u'Documentation à propos de HTML',
+             'Documentation_.C3.A0_propos_de_HTML'),
+            (u'Outils facilitant le développement HTML',
+             'Outils_facilitant_le_d.C3.A9veloppement_HTML'),
+            (u'例:\u00a0スキューと平行移動',
+             '.E4.BE.8B:.C2.A0.E3.82.B9.E3.82.AD.E3.83.A5.E3.83.BC.E3.81.A8.E5.B9.B3.E8.A1.8C.E7.A7.BB.E5.8B.95'),
+            (u'例:\u00a0回転',
+             '.E4.BE.8B:.C2.A0.E5.9B.9E.E8.BB.A2'),
+            (u'Documentação',
+             'Documenta.C3.A7.C3.A3o'),
+            (u'Lektury uzupełniające',
+             'Lektury_uzupe.C5.82niaj.C4.85ce'),
+            (u'Атрибуты',
+             '.D0.90.D1.82.D1.80.D0.B8.D0.B1.D1.83.D1.82.D1.8B'),
+            (u'HTML5 엘리먼트',
+             'HTML5_.EC.97.98.EB.A6.AC.EB.A8.BC.ED.8A.B8'),
+        ]

+        section_filter = SectionIDFilter('')

+        for original, slugified in headers:
+            ok_(slugified == section_filter.slugify(original))


    @attr('toc')
    def test_generate_toc(self):
        doc_src = """