forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vrt-split
executable file
·192 lines (169 loc) · 7.22 KB
/
vrt-split
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#! /usr/bin/env python3
# -*- mode: Python; -*-
"""
vrt-split
Split VRT (horizontally) at text boundaries into multiple output files.
"""
# TODO:
# - Support some kind of transformations for attribute values to be used as
# part of the output filename
import os.path
import re
import sys
import vrtargsoolib
import vrtcommentlib
import vrtnamelib
class Splitter(vrtargsoolib.InputProcessor):

    """Split VRT input at structure boundaries into multiple output files.

    The input is read line by line as bytes. Lines are buffered until the
    next start tag of the splitting structure and then flushed to the
    current output file, or to a new one if a size limit has been exceeded.
    """

    # FIXME (libraries): This class currently needs to be a subclass of
    # InputProcessor to work, even though the output options are not relevant.

    # The text is kept flush-left so that the string content is unchanged.
    DESCRIPTION = """
Split VRT input at specified structure boundaries into multiple
output files having at most a specified number of structures or
tokens.
"""
    ARGSPECS = [
        # ('file',
        #  'input VRT file to be split'),
        (('--output-filename-template|out-template=templ'
          ' "{dir}/{basename}-{num1:02d}{ext}" -> out_fname_templ'),
         'name output files according to template templ, which may contain'
         ' the following format keys: from input file: {dir} = directory,'
         ' {filename} = file name, {basename} = file name without extension,'
         ' {ext} = extension; from slice number: {num0} = 0-based decimal,'
         ' {num1} = 1-based decimal, {lower} = lower-case letter, {upper} ='
         ' upper-case letter, {attr[name]} = the value of attribute name in'
         ' the structure by which to split the input (the first of them in'
         ' the output file)'),
        ('--structure=struct "text" -> struct',
         'split the input at the boundaries of structure struct'),
        ('--keep-together-attribute=attr -> keep_attr',
         'keep together consecutive structs with the same value for attribute'
         ' attr'),
        ('--max-structs=num :int =10000',
         'output at most num structures to a single file; note that the limit'
         ' may be exceeded if the file name template refers to an attribute'
         ' value that is not unique among structures'),
        ('--max-tokens=num :int =0',
         'output at most num tokens to a single file; 0 = unlimited; the same'
         ' note applies as to --max-structs'),
        ('--omit-split-info',
         'do not output a VRT comment informing of having split a VRT file'),
    ]

    class OPTIONS(vrtargsoolib.InputProcessor.OPTIONS):
        # NOTE(review): presumably turns off the in-place processing option
        # of InputProcessor; confirm against vrtargsoolib.
        arg_inplace = False

    def __init__(self):
        super().__init__()

    def main(self, args, inf, ouf):
        """Split the lines of `inf` into output files named by the template.

        Arguments:
            args: parsed options; the attributes read here are `infile`,
                `out_fname_templ`, `struct`, `keep_attr`, `max_structs`,
                `max_tokens` and `omit_split_info`.
            inf: iterable of input lines as bytes.
            ouf: unused; output goes to the files named by
                `args.out_fname_templ`.

        If the input contains nothing to be written (it is empty or has
        only header comments), no output file is created.
        """
        fname_comps = dict(
            dir=os.path.dirname(args.infile or '') or '.',
            filename=os.path.basename(args.infile or '(stdin).vrt'))
        fname_comps['basename'], fname_comps['ext'] = os.path.splitext(
            fname_comps['filename'])

        def make_attr_value_re(attrname):
            # Compile a regex capturing the value of attribute attrname in
            # a start tag line. The name is escaped so that it can only
            # match literally.
            return re.compile(
                b' ' + re.escape(attrname.encode()) + b'="(.*?)"')

        # Names of the attributes referred to as {attr[name]} in the
        # file name template, and the regexes for extracting their values
        fname_attrnames = re.findall(r'\{attr\[(.+?)\]', args.out_fname_templ)
        fname_attrval_res = {
            attrname: make_attr_value_re(attrname)
            for attrname in fname_attrnames}
        # Lines buffered since the previous flush
        out_lines = []
        # The first line recognized by vrtnamelib.isbinnames; written at
        # the top of each new output file
        attr_comment = b''
        # VRT comments preceding all other content; repeated in each new
        # output file
        head_comment = []
        check_keep_attr = args.keep_attr is not None
        struct_name = args.struct.encode()
        # Start tag prefixes of the splitting structure: without and with
        # attributes
        struct_begin_bare = b'<' + struct_name + b'>'
        struct_begin_attrs = b'<' + struct_name + b' '
        struct_begins = (struct_begin_bare, struct_begin_attrs)
        keep_value = None
        keep_value_new = None
        # Compiled even without --keep-together-attribute but used only if
        # check_keep_attr is true
        keep_value_re = make_attr_value_re(args.keep_attr or '')
        # Counts for the current output file, including buffered lines
        struct_cnt = 0
        token_cnt = 0
        # Counts at the time of the previous flush
        prev_struct_cnt = 0
        prev_token_cnt = 0
        outfile_cnt = 0
        outfile_names = set()
        outf = None

        def get_re_match(regex, line):
            # Return the first group of regex in line, or b'' if no match
            mo = regex.search(line)
            return mo.group(1) if mo else b''

        def get_keep_attr_value(line):
            return get_re_match(keep_value_re, line)

        def get_fname_attr_values(line):
            return {
                attrname:
                    get_re_match(fname_attrval_res[attrname], line).decode()
                for attrname in fname_attrnames}

        def new_outfile():
            # Close the current output file (if any) and open the next one,
            # named according to the template
            nonlocal outf, outfile_cnt
            if outf is not None:
                outf.close()
            # Default to empty values so that a start tag without any
            # attributes is treated the same as a start tag lacking the
            # attribute, instead of raising a KeyError when formatting
            fname_attrvals = dict.fromkeys(fname_attrnames, '')
            if fname_attrnames:
                # Take the values from the first start tag with attributes
                # among the lines to be written
                for line in out_lines:
                    if line.startswith(struct_begin_attrs):
                        fname_attrvals = get_fname_attr_values(line)
                        break
            # outfile_cnt is used only if a new file is created, so it need not
            # be decremented even if appending to an existing file
            outfile_cnt = len(outfile_names) + 1
            # NOTE: {lower} and {upper} are letters only for the first 26
            # output files; after that chr() yields other characters
            outfile_name = args.out_fname_templ.format(
                num0=outfile_cnt - 1,
                num1=outfile_cnt,
                lower=chr(outfile_cnt + 96),
                upper=chr(outfile_cnt + 64),
                attr=fname_attrvals,
                **fname_comps)
            # If the file has already been written to, append to it
            is_new = outfile_name not in outfile_names
            outf = open(outfile_name, ('w' if is_new else 'a') + 'b')
            if is_new:
                outf.write(attr_comment)
                outf.writelines(head_comment)
                if not args.omit_split_info:
                    outf.write(vrtcommentlib.makebinvrtcomment(
                        b'info',
                        ('Split ' + (args.infile or '(stdin)')
                         + ': part ' + str(outfile_cnt)
                         + ', ' + os.path.basename(outfile_name)).encode()))
                outfile_names.add(outfile_name)
            return outf

        def output():
            # Flush the buffered lines, first switching to a new output
            # file if none is open or if the buffered lines would make the
            # current one exceed a limit
            nonlocal outf, out_lines
            nonlocal struct_cnt, token_cnt, prev_struct_cnt, prev_token_cnt
            if (outf is None
                    or (args.max_structs > 0
                        and struct_cnt > args.max_structs)
                    or (args.max_tokens > 0
                        and token_cnt > args.max_tokens)):
                outf = new_outfile()
                # Only the buffered lines count towards the new file
                struct_cnt -= prev_struct_cnt
                token_cnt -= prev_token_cnt
            outf.writelines(out_lines)
            out_lines = []
            prev_struct_cnt = struct_cnt
            prev_token_cnt = token_cnt

        try:
            for line in inf:
                if not attr_comment and vrtnamelib.isbinnames(line):
                    attr_comment = line
                    continue
                if (outfile_cnt == 0 and not out_lines
                        and vrtcommentlib.isbinvrtcomment(line)):
                    head_comment.append(line)
                    continue
                if line.startswith(b'<'):
                    if line.startswith(struct_begins):
                        if check_keep_attr:
                            keep_value_new = get_keep_attr_value(line)
                        # Flush the preceding structures unless this one
                        # needs to be kept together with them
                        if struct_cnt and (keep_value is None
                                           or keep_value_new != keep_value):
                            output()
                        keep_value = keep_value_new
                        struct_cnt += 1
                else:
                    # Every line not beginning with "<" counts as a token
                    token_cnt += 1
                out_lines.append(line)
            if out_lines:
                output()
        finally:
            # outf is None if nothing was written (for example, empty
            # input); close the file also if processing failed midway
            if outf is not None:
                outf.close()
if __name__ == '__main__':
    # Run the splitter only when executed as a script, not when imported.
    splitter = Splitter()
    splitter.run()