forked from evandrocoan/ufscthesisx-setup
-
Notifications
You must be signed in to change notification settings - Fork 1
/
remove_lang.py
268 lines (205 loc) · 9.01 KB
/
remove_lang.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
####################### Licensing #######################################################
#
# Copyright 2019 @ Evandro Coan
# Helper functions and classes
#
# Redistributions of source code must retain the above
# copyright notice, this list of conditions and the
# following disclaimer.
#
# Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# Neither the name Evandro Coan nor the names of any
# contributors may be used to endorse or promote products
# derived from this software without specific prior written
# permission.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or ( at
# your option ) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#########################################################################################
#
import re
import os
import sys
import unittest
# Fail early with installation instructions when the third-party packages
# are missing, instead of showing a raw traceback to the user.
try:
    import pushdown
    import debug_tools

# Only a failed import means "not installed"; any other error raised while
# importing (or a Ctrl+C) must not be hidden behind the message below.
except ImportError:
    print("Please, install the Python 3 packages `pushdown` and `debug_tools`!")
    print("You can run these commands:")
    print(" pip3 install pushdown")
    print(" pip3 install debug_tools")
    print("")

    # `sys.exit` is always available, while the bare `exit` name is a helper
    # added by the `site` module and may be absent (e.g. with `python -S`).
    sys.exit(1)
from pushdown import Lark
from pushdown import Tree
from debug_tools import getLogger
from debug_tools.utilities import wrap_text
log = getLogger(3)
def main():
    """Script entry point.

    Raises the interpreter recursion limit and then runs `parse_all_files`
    over the parent of the directory which holds this script.
    """
    # NOTE(review): presumably required by the deeply nested parse trees
    # walked recursively by `remove_lang_tag` -- confirm before lowering it.
    sys.setrecursionlimit(10000)

    # To run the self tests instead, call `unittest.main()` at this point.
    script_directory = os.path.dirname( os.path.realpath( __file__ ) )
    parse_all_files( os.path.dirname( script_directory ) )
class BasicUnitTests(unittest.TestCase):
    """Checks `remove_lang_tag` against small, hand written LaTeX snippets."""

    def test_simplest_valid_input(self):
        # A single tag must be replaced by its second argument.
        actual = remove_lang_tag( r"\lang{some}{thing}" )
        expected = wrap_text(
"""
+ thing
""" )
        self.assertEqual( expected, actual )

    def test_valid_input_with_nested_langs(self):
        # Tags nested inside the arguments are not expanded: the second
        # argument is kept verbatim, including its own inner tag.
        source = r"\lang{some\lang{invalide}{stuff}more}{think\lang{stuff}{invalide}lang}"
        actual = remove_lang_tag( source )
        expected = wrap_text(
r"""
+ think\lang{stuff}{invalide}lang
""" )
        self.assertEqual( expected, actual )

    def test_simplest_invalid_input(self):
        # An unbalanced brace inside the first argument is a parse error.
        self.assertRaisesRegex(
                pushdown.exceptions.UnexpectedToken,
                "Unexpected token Token",
                remove_lang_tag,
                r"\lang{some{}{thing}",
            )

    def test_invalid_input_with_nested_langs(self):
        # Same as above, with the unbalanced brace inside a nested tag.
        self.assertRaisesRegex(
                pushdown.exceptions.UnexpectedToken,
                "Unexpected token Token",
                remove_lang_tag,
                r"\lang{some\lang{{invalide}{stuff}more}{think\lang{stuff}{invalide}lang}",
            )

    def test_big_paragraphs(self):
        # Multi-line arguments whose braces are followed by LaTeX comments.
        source = wrap_text( r"""
\preambulo{\lang%
{%
\imprimirtipotrabalho~submitted to the \imprimirprograma~of
\imprimirinstituicao~for degree acquirement in \imprimirformacao.%
}{%
\imprimirtipotrabalho~submetido ao \imprimirprograma~da
\imprimirinstituicao~para a obtenção do Grau de \imprimirformacao.%
}%
}
""" )
        actual = remove_lang_tag( source )
        expected = wrap_text(
r"""
+ \preambulo{%
+ \imprimirtipotrabalho~submetido ao \imprimirprograma~da
+ \imprimirinstituicao~para a obtenção do Grau de \imprimirformacao.%
+ %
+ }
""" )
        self.assertEqual( expected, actual )
# Module-level parser, built once from the grammar below with the LALR
# algorithm and the contextual lexer. `remove_lang_tag` feeds it a whole
# document and reads the token offsets from the resulting tree.
#
# Overview of the grammar rules:
#   start          - either text up to a backslash followed by the word
#                    `lang` and a `do_lang_start`; or text up to a backslash
#                    which is NOT followed by `lang`, then `start` again; or
#                    an optional tail without any backslash.
#   do_lang_start  - two brace groups, each optionally preceded by
#                    whitespace, named `english` (first) and `portuguese`
#                    (second), followed by `start` for the remaining text.
#                    The second alternative accepts a lone `}` followed by
#                    `start`.
#   recursive      - text with balanced, arbitrarily nested braces. It does
#                    not look for `lang` tokens, so tags nested inside an
#                    argument are handled as plain text.
#
# Each brace has its own rule name (`english_open_brace`, ...) which lets the
# tree walker identify the brace by the name of its parent rule.
#
# NOTE(review): `recursive_porguese` looks like a misspelling of
# "portuguese". It is only referenced inside this grammar as far as this file
# shows -- confirm nothing else depends on the name before renaming it.
parser = Lark(r'''
start: THINGS_UP_TO_SLASH lang_token_name do_lang_start | THINGS_UP_TO_LANG_NO_SLASH start | THINGS_UP_TO_NO_SLASH?
lang_token_name: LANG_TOKEN
LANG_TOKEN: "lang"
THINGS_UP_TO_SLASH: /[^\\]*\\/
THINGS_UP_TO_NO_SLASH: /[^\\]+/
THINGS_UP_TO_LANG_NO_SLASH: /[^\\]*\\(?!lang)/
do_lang_start: SPACES? english_open_brace recursive_english english_close_brace SPACES? portuguese_open_brace recursive_porguese portuguese_close_brace start
| "}" start
english_open_brace: OPEN_BRACE
english_close_brace: CLOSE_BRACE
portuguese_open_brace: OPEN_BRACE
portuguese_close_brace: CLOSE_BRACE
recursive_english: recursive
recursive_porguese: recursive
recursive: ANYTHINGNONE?
| ANYTHINGNONE? OPEN_BRACE recursive CLOSE_BRACE ANYTHINGNONE?
| recursive OPEN_BRACE recursive CLOSE_BRACE recursive
SPACES: /\s+/
ANYTHINGNONE: /[^{}]+/
OPEN_BRACE: "{"
CLOSE_BRACE: "}"
''',
parser='lalr', lexer='contextual'
)
def remove_lang_tag(fulltext):
    r"""Replace each ``\lang{english}{portuguese}`` by its Portuguese argument.

    Args:
        fulltext: the LaTeX source text to process.

    Returns:
        A copy of `fulltext` where every span going from the backslash of
        ``\lang`` up to the closing brace of its second argument is replaced
        by the contents of that second argument. When no tag is found the
        text is returned unchanged.

    Raises:
        Whatever `parser.parse` raises when the text does not match the
        grammar (the unit tests of this file expect
        `pushdown.exceptions.UnexpectedToken` for unbalanced braces).
    """

    def blank_comment(match):
        """Return as many spaces as the matched LaTeX comment has characters."""
        comment = match.group()
        log( 4, 'start', match.start(), 'end', match.end(), 'removing', comment )
        return " " * len( comment )

    # Remove comments because they may have broken \lang{ tags and we are not
    # parsing this. They are overwritten with spaces instead of deleted, so
    # every offset found on `newtext` is also valid on `fulltext`.
    # NOTE(review): an escaped percent sign (`\%`) is blanked as well, which
    # hides from the parser any brace following it on that line -- confirm.
    newtext = re.sub( r"%.*", blank_comment, fulltext )
    log( 4, 'newtext', newtext )

    tree = parser.parse( newtext )

    # Offsets of the tag currently being visited, updated by `parse_tree`
    english_open_brace = -1
    portuguese_open_brace = -1

    # (index after the Portuguese `{`, index after the Portuguese `}`)
    save_ranges = []

    # (index of the backslash of `\lang`, index after the English `}`)
    delete_ranges = []

    def parse_tree(tree, level, children_count):
        """Walk the tree depth first, recording the offsets of each tag."""
        # `nonlocal` binds to the variables of this `remove_lang_tag` call,
        # so nothing is leaked into (or read from) the module globals.
        nonlocal english_open_brace
        nonlocal portuguese_open_brace
        level_name = tree.data

        for node in tree.children:

            if isinstance( node, Tree ):
                log( 4, "level: %s, level_name: %-25s children: %s", level, level_name, children_count )
                parse_tree( node, level+1, len( node.children ) )
                continue

            log( 4, "level: %s, level_name: %-25s node: %-8s %s", level, level_name, "`" + str( node ) + "`", node.__class__.__name__ )

            # NOTE(review): assumes `pos_in_stream` is the character offset
            # of the token inside the parsed text -- confirm against pushdown.
            if level_name == 'lang_token_name':
                # The grammar only accepts `lang` right after a backslash,
                # so the previous character is where the tag starts.
                english_open_brace = node.pos_in_stream - 1

            elif level_name == 'portuguese_open_brace':
                portuguese_open_brace = node.pos_in_stream + 1

            elif level_name == 'english_close_brace':
                english_close_brace = node.pos_in_stream + 1
                delete_ranges.append( (english_open_brace, english_close_brace) )
                log( 2, 'saving delete_ranges', '{:<16}'.format(str(delete_ranges[-1])), '%r' % newtext[english_open_brace:english_close_brace])

            elif level_name == 'portuguese_close_brace':
                portuguese_close_brace = node.pos_in_stream + 1
                save_ranges.append( (portuguese_open_brace, portuguese_close_brace) )
                log( 2, 'saving save_ranges', '{:<16}'.format(str(save_ranges[-1])), '%r' % newtext[portuguese_open_brace:portuguese_close_brace])

    parse_tree( tree, 0, len( tree.children ) )

    if save_ranges:
        log.clean(2, "")

    log( 2, 'Saving all Portuguese \\langs...' )

    # Rewrite from the last tag to the first one, so the offsets of the tags
    # not processed yet are not shifted by the replacements already done.
    for (start, end), (tag_start, _) in zip( reversed( save_ranges ), reversed( delete_ranges ) ):
        # `end` points just after the closing brace, which is not kept
        saved = fulltext[start:end-1]
        log( 2, 'start', start, 'end', end, 'saving', saved )

        log( 2, 'start', tag_start, 'end', end, 'deleting', fulltext[tag_start:end] )
        fulltext = fulltext[:tag_start] + saved + fulltext[end:]

    return fulltext
def parse_all_files(current_directory):
    """Run `remove_lang_tag` over every `.tex` file under `current_directory`.

    The directory tree is walked recursively, following symbolic links, and
    directories named `.git` are not entered. A file is only written back
    when its contents were changed by `remove_lang_tag`.

    Args:
        current_directory: path of the root directory to search.

    Raises:
        UnicodeDecodeError: when a `.tex` file is not valid UTF-8.
        Anything raised by `remove_lang_tag`, which stops the whole run.
    """
    log( 2, "Packing files on %s" % current_directory )

    for direcory_name, dirs, files in os.walk(current_directory, followlinks=True):
        # Prune in place, so `os.walk` never descends into the repository
        # metadata. Comparing whole directory names (instead of searching
        # ".git" in the full path) keeps directories such as `.github`, and
        # roots whose own path contains ".git", from being skipped.
        dirs[:] = [name for name in dirs if name != ".git"]

        for filename in files:

            if not filename.endswith( ".tex" ):
                continue

            filepath = os.path.join( direcory_name, filename )

            # Binary mode keeps the line endings of the file untouched
            with open(filepath, 'rb') as file:
                contents = file.read().decode('utf-8')

            newcontents = remove_lang_tag(contents)

            if newcontents != contents:
                log.clean("")
                log.clean("")
                log.clean("")
                log( 2, 'Processing file', filepath)

                with open(filepath, 'wb') as file:
                    file.write(newcontents.encode('utf-8'))

    log(1, "Done!" )
# Only run the conversion when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()