forked from fxcoudert/tools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
texaccents
executable file
·136 lines (119 loc) · 3.03 KB
/
texaccents
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# texaccents
#
# purpose: remove non-ASCII characters and replace them with TeX codes
# license: MIT License
# author: François-Xavier Coudert
# e-mail: fxcoudert@gmail.com
#
# Unicode combining marks (as they appear after NFD decomposition) mapped to
# the LaTeX accent macro that produces the same diacritic.
combiningAccents = {
    # Marks placed above the base letter
    "\u0300": "\\`",   # grave
    "\u0301": "\\'",   # acute
    "\u0302": "\\^",   # circumflex
    "\u0303": "\\~",   # tilde
    "\u0304": "\\=",   # macron
    "\u0306": "\\u",   # breve
    "\u0307": "\\.",   # dot above
    "\u0308": "\\\"",  # diaeresis
    "\u030a": "\\r",   # ring above
    "\u030b": "\\H",   # double acute
    "\u030c": "\\v",   # caron
    # Marks placed below (or attached to) the base letter
    "\u0323": "\\d",   # dot below
    "\u0327": "\\c",   # cedilla
    "\u0328": "\\k",   # ogonek
    "\u0331": "\\b",   # macron below
}
# Replacement for non-ASCII characters that are not handled by accent
# decomposition.  Every value is pure ASCII.  Control words are either
# wrapped in braces or in $...$ so that they can never fuse with the text
# that follows them in the output.
nonASCII = {
    "æ": "{\\ae}",                 # Letters and ligatures
    "Æ": "{\\AE}",
    "ð": "{\\dh}",
    "Ð": "{\\DH}",
    "þ": "{\\th}",
    "Þ": "{\\TH}",
    "ı": "{\\i}",
    "ȷ": "{\\j}",
    "ł": "{\\l}",
    "Ł": "{\\L}",
    "Ŋ": "{\\NG}",
    "ŋ": "{\\ng}",
    "œ": "{\\oe}",
    "Œ": "{\\OE}",
    "ø": "{\\o}",
    "Ø": "{\\O}",
    "ß": "{\\ss}",
    "–": "--",                     # Dashes
    "—": "---",
    "−": "--",
    "¡": "{!`}",                   # Punctuation
    "¿": "{?`}",
    # \cdotp is a math-only character and, unbraced, would merge with any
    # letters following it; use the math-mode centered dot instead.
    "·": "$\\cdot$",
    "≥": "$\\ge$",                 # Math operators
    "≤": "$\\le$",
    "≠": "$\\neq$",
    # \~ is an accent macro that consumes the next token; the tilde
    # operator is \sim in math mode.
    "∼": "$\\sim$",
    # Braced so that a following letter or space is not swallowed.
    "©": "{\\copyright}",          # Misc
    # \deg typesets the operator name "deg"; the degree sign is a raised
    # \circ.
    "°": "$^\\circ$",
    "α": "$\\alpha$",              # Greek
    "β": "$\\beta$",
    "γ": "$\\gamma$",
    "δ": "$\\delta$",
    "ε": "$\\epsilon$",
    "ζ": "$\\zeta$",
    "η": "$\\eta$",
    "θ": "$\\theta$",
    "ι": "$\\iota$",
    "κ": "$\\kappa$",
    "λ": "$\\lambda$",
    "\u00b5": "$\\mu$",            # micro sign
    "\u03bc": "$\\mu$",            # Greek small letter mu
    "ν": "$\\nu$",
    "ξ": "$\\xi$",
    "π": "$\\pi$",
    "ρ": "$\\rho$",
    "σ": "$\\sigma$",
    "τ": "$\\tau$",
    "υ": "$\\upsilon$",
    "φ": "$\\phi$",
    "χ": "$\\chi$",
    "ψ": "$\\psi$",
    "ω": "$\\omega$",
    "‘": "`",                      # Quotes
    "’": "'",
    "“": "``",
    "”": "''",
    "′": "$^\\prime$",
    "‚": ",",                      # Weird characters
    "„": ",,",
    "\xa0": " ",                   # Unprintable characters
}
def replace_accents(s):
    """Replace combining accents by the corresponding TeX macros.

    The string is decomposed (NFD) so that accented letters become a base
    character followed by combining marks.  Each mark found in
    ``combiningAccents`` is rewritten as ``<macro>{<base>}``; several marks
    on the same base nest, the first mark innermost.  Marks that are not in
    the table are left in place, and the result is recomposed (NFC) so such
    leftovers return to their precomposed form where one exists.

    An empty string is returned unchanged.
    """
    import unicodedata

    # Nothing to decompose; also avoids indexing into an empty string below.
    if not s:
        return s

    decomposed = unicodedata.normalize('NFD', s)
    pieces = []
    # `current` is the base character (possibly already wrapped in accent
    # macros) that subsequent combining marks apply to.
    current = decomposed[0]
    for c in decomposed[1:]:
        if unicodedata.combining(c) and c in combiningAccents:
            # Accented i and j must use the dotless forms \i and \j.
            if current in ("i", "j"):
                current = "\\" + current
            current = combiningAccents[c] + "{" + current + "}"
        else:
            pieces.append(current)
            current = c
    pieces.append(current)
    return unicodedata.normalize('NFC', "".join(pieces))
def texaccents(s):
    """Return *s* with its non-ASCII characters rewritten as TeX commands.

    Works in two passes: characters that have an entry in ``nonASCII`` are
    substituted one by one, then the result is handed to
    ``replace_accents`` to convert accented letters.
    """
    substituted = "".join(nonASCII.get(ch, ch) for ch in s)
    return replace_accents(substituted)
def main():
    """Command-line entry point: filter standard input to standard output.

    Each line read from stdin is converted with ``texaccents`` and written
    to stdout.  The program takes no arguments; if any are given, a usage
    message is printed to stderr and the process exits with status 1.
    """
    import codecs
    import locale
    import sys

    # Set our output to the right encoding if none was chosen.  The codecs
    # writer hands encoded bytes to the stream it wraps, so wrap the
    # underlying binary buffer when the stream exposes one.
    if sys.stdout.encoding is None:
        raw = getattr(sys.stdout, "buffer", sys.stdout)
        sys.stdout = codecs.getwriter(locale.getpreferredencoding())(raw)

    if len(sys.argv) > 1:
        # The usage goes to stderr: stdout is normally redirected to the
        # output file, where the message would be mistaken for converted text.
        print("\nUsage: texaccents <infile >outfile\n", file=sys.stderr)
        sys.exit(1)

    # Iterate over stdin directly so the whole input is never held in memory.
    for line in sys.stdin:
        print(texaccents(line), end='')


if __name__ == '__main__':
    main()