-
Notifications
You must be signed in to change notification settings - Fork 0
/
pelican_cjk.py
142 lines (104 loc) · 4.7 KB
/
pelican_cjk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import os
import re
from pelican import signals
def ranges_as_regex(ranges):
    """Build a regex character class from ``(start, end)`` pairs.

    Each pair contributes ``start-end`` to the class.  When ``end`` is the
    empty string, ``start`` is inserted verbatim instead, so it may hold one
    or more literal characters that the caller has already escaped for use
    inside ``[...]``.

    Returns the pattern text including the surrounding brackets, for
    example ``[a-z0-9]``.
    """
    # Collect the pieces and join once instead of growing a string with +=.
    parts = [
        start if end == '' else '{}-{}'.format(start, end)
        for start, end in ranges
    ]
    return '[' + ''.join(parts) + ']'
# Unicode ranges, as (start, end) pairs, that this module treats as CJK text.
# The pairs are turned into a regex character class by ranges_as_regex().
CJK_RANGES = (
    # References: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs
    (u'\u4E00', u'\u9FFF'),  # CJK Unified Ideographs
    (u'\u3400', u'\u4DBF'),  # CJK Unified Ideographs Ext. A
    (u'\U00020000', u'\U0002A6DF'),  # CJK Unified Ideographs Ext. B
    (u'\U0002A700', u'\U0002B73F'),  # CJK Unified Ideographs Ext. C
    (u'\U0002B740', u'\U0002B81F'),  # CJK Unified Ideographs Ext. D
    (u'\U0002B820', u'\U0002CEAF'),  # CJK Unified Ideographs Ext. E
    (u'\U0002CEB0', u'\U0002EBEF'),  # CJK Unified Ideographs Ext. F
    (u'\uF900', u'\uFAFF'),  # CJK Compatibility Ideographs
    (u'\U0002F800', u'\U0002FA1F'),  # CJK Compatibility Ideographs Supplement
    # References: https://stackoverflow.com/a/30200250
    (u'\u3040', u'\u309F'),  # Hiragana
    (u'\u30A0', u'\u30FF'),  # Katakana
    # NOTE: the next entry is a subset of the U+4E00-U+9FFF range listed
    # first, so it is redundant (harmless inside a character class).
    (u'\u4E00', u'\u9FAF'),  # CJK Unified Ideographs
    # References: https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_in_Unicode
    (u'\uAC00', u'\uD7A3'),  # Hangul Syllables
    (u'\u1100', u'\u11FF'),  # Hangul Jamo
    (u'\u3130', u'\u318F'),  # Hangul Compatibility Jamo
    (u'\uA960', u'\uA97F'),  # Hangul Jamo Ext. A
    (u'\uD7B0', u'\uD7FF'),  # Hangul Jamo Ext. B
)
# Unicode ranges, as (start, end) pairs, treated as CJK punctuation.
PUNC_RANGES = (
    (u'\u3000', u'\u303F'),  # CJK Symbols and Punctuation
    (u'\uFF00', u'\uFFEF'),  # Halfwidth and Fullwidth Forms
)
# CJK characters and CJK punctuation combined into a single tuple of ranges.
CJK_PUNC_RANGES = CJK_RANGES + PUNC_RANGES
# Alphabets, Numbers and Symbols
# Entries whose second element is '' are not ranges: ranges_as_regex()
# inserts their first element verbatim into the character class.
ANS_RANGES = (
    ('a', 'z'),
    ('A', 'Z'),
    ('0', '9'),
    ("'", ''),  # Single quote (kept separate from the raw string below)
    # ASCII symbols, already escaped for use inside [...].
    # The characters <>&; are left out of this list.
    # NOTE(review): presumably because they occur in HTML tags and
    # entities -- confirm.
    (r'`~@#%"/_=,:\!\$\^\*\(\)\-\+\[\]\{\}\\\|\.\?', ''),
    (u'\u00A1', u'\u00FF'),  # U+00A1-U+00FF (upper part of Latin-1 Supplement)
    (u'\u0370', u'\u03FF'),  # Greek and Coptic
)
# Patterns for finding the newline character that should be removed.
# Each pattern captures the character before the line break (group 1) and
# uses a lookahead for the character after it, so only the break is consumed.
#
# The line break is matched as '\r?\n' rather than os.linesep: text read
# through Python's text-mode I/O normally has its line endings translated to
# '\n' on every platform, so os.linesep ('\r\n' on Windows) would not match
# there.  The optional '\r' still accepts untranslated Windows line endings.
_NEWLINE = r'\r?\n'
newline_pattern = re.compile(r'({cjk_punc}){newline}(?=({cjk_punc}))'.format(
    cjk_punc=ranges_as_regex(CJK_PUNC_RANGES), newline=_NEWLINE))
punc_ans_pattern = re.compile(r'({}){}(?=({}))'.format(
    ranges_as_regex(PUNC_RANGES), _NEWLINE, ranges_as_regex(ANS_RANGES)))
ans_punc_pattern = re.compile(r'({}){}(?=({}))'.format(
    ranges_as_regex(ANS_RANGES), _NEWLINE, ranges_as_regex(PUNC_RANGES)))
# Patterns for finding CJK and ANS written without a space in between.
# Group 1 is the first character, group 2 an optional single tag between the
# two characters, and group 3 the second character.
# The (<[^<]*?>)? part matches at most one tag: [^<] stops the match from
# running into the start of a following tag.
# Note that the CJK range used here does not include CJK punctuation.
cjk_ans_pattern = re.compile(r'({})(<[^<]*?>)?({})'.format(
    ranges_as_regex(CJK_RANGES), ranges_as_regex(ANS_RANGES)))
ans_cjk_pattern = re.compile(r'({})(<[^<]*?>)?({})'.format(
    ranges_as_regex(ANS_RANGES), ranges_as_regex(CJK_RANGES)))
# Pattern for finding CJK-tag-CJK: one tag (group 1), with optional spaces on
# either side, sitting directly between two CJK or CJK-punctuation characters.
# The neighbouring characters are matched by lookbehind/lookahead and are not
# consumed.  As above, <[^<]*?> is written so that only one tag is matched.
cjk_tag_cjk_pattern = re.compile(r'(?<={cjk_punc}) *(<[^<]*?>) *(?={cjk_punc})'.format(
    cjk_punc=ranges_as_regex(CJK_PUNC_RANGES)))
def remove_paragraph_newline(text):
    """Return ``text`` with line breaks between CJK-related characters removed.

    Applies ``newline_pattern``, ``punc_ans_pattern`` and
    ``ans_punc_pattern`` in that order.  Every match is replaced by its first
    captured group, i.e. the character that preceded the line break.
    """
    for pattern in (newline_pattern, punc_ans_pattern, ans_punc_pattern):
        text = pattern.sub(r'\1', text)
    return text
def auto_spacing(text):
    """Return ``text`` with a space inserted between adjacent CJK and ANS characters.

    ``cjk_ans_pattern`` is applied first, then ``ans_cjk_pattern``; both use
    the same replacement callback.
    """
    def place_space(match):
        """Rebuild the matched text with one space in the right position."""
        left, tag, right = match.groups()
        if tag is None:
            # The two characters touch directly: separate them.
            return ''.join((left, ' ', right))
        if tag.startswith('</'):
            # End tag: the space goes after the tag.
            return ''.join((left, tag, ' ', right))
        # Any other tag: the space goes before the tag.
        return ''.join((left, ' ', tag, right))

    spaced = cjk_ans_pattern.sub(place_space, text)
    return ans_cjk_pattern.sub(place_space, spaced)
def remove_markup_spacing(text):
    """Return ``text`` with spaces around a tag between CJK characters removed.

    Each ``cjk_tag_cjk_pattern`` match (optional spaces, one tag, optional
    spaces) is replaced by the tag alone.
    """
    tag_only = r'\1'
    return cjk_tag_cjk_pattern.sub(tag_only, text)
def main(content):
    """Apply the enabled CJK clean-up steps to ``content._content`` in place.

    Does nothing when ``content._content`` is ``None``.  Each step runs only
    if its setting is exactly ``True``; a setting that is absent defaults to
    ``True``:

    * ``CJK_REMOVE_PARAGRAPH_NEWLINE`` -> ``remove_paragraph_newline``
    * ``CJK_AUTO_SPACING`` -> ``auto_spacing``
    * ``CJK_REMOVE_MARKUP_SPACING`` -> ``remove_markup_spacing``
    """
    text = content._content
    if text is None:
        return

    settings = content.settings

    if settings.get('CJK_REMOVE_PARAGRAPH_NEWLINE', True) is True:
        text = remove_paragraph_newline(text)

    if settings.get('CJK_AUTO_SPACING', True) is True:
        text = auto_spacing(text)

    if settings.get('CJK_REMOVE_MARKUP_SPACING', True) is True:
        text = remove_markup_spacing(text)

    content._content = text
def register():
    """Connect ``main`` to the ``content_object_init`` signal."""
    content_init = signals.content_object_init
    content_init.connect(main)