forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vrt-old-name
executable file
·150 lines (120 loc) · 4.52 KB
/
vrt-old-name
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#! /usr/bin/env python3
# -*- mode: Python; -*-
from argparse import ArgumentTypeError
from itertools import groupby, chain
import enum, os, re, sys, traceback
from vrtargslib import trans_args, trans_main
from vrtargslib import BadData, BadCode
from vrtnamelib import isxname, isbinnames, binmakenames
def spectype(text):
    """Argument type for a position specification of the form K=NAME.

    K must be a positive decimal integer without leading zeros and
    NAME must be accepted by isxname. Return the whole specification
    encoded as UTF-8 bytes.

    Raise ArgumentTypeError if the specification is malformed.
    """
    parsed = re.fullmatch(R'[1-9][0-9]*=(.+)', text)
    # the name is only inspected when the overall shape is right
    if parsed is None or not isxname(parsed.group(1)):
        raise ArgumentTypeError('malformed specification: {}'
                                .format(text))
    return text.encode('UTF-8')
def numtype(text):
    """Argument type for the number of fields.

    Accept a positive decimal integer without leading zeros and
    return it as an int.

    Raise ArgumentTypeError for anything else.
    """
    if re.fullmatch(R'[1-9][0-9]*', text) is None:
        raise ArgumentTypeError('malformed number of fields: {}'
                                .format(text))
    return int(text)
def parsearguments():
    """Parse the command line and return the resulting namespace.

    The parser comes from trans_args and is extended with --number/-n
    (number of fields, default 1) and a repeatable --position/-k
    (K=NAME specifications, default empty list). The program name of
    the parser is stored in the namespace as args.prog.
    """
    summary = '''
Insert a positional-attributes comment at the beginning of a vrt
document to give names for the fields. Also replace any existing
positional-attributes comment. Fail if the number of fields does
not match the new names. (Default names are v1, v2, ...)
'''
    parser = trans_args(description = summary)

    # how many fields there are, at least
    parser.add_argument(
        '--number', '-n',
        metavar = 'num',
        type = numtype,
        default = 1,
        help = 'number of fields')

    # explicit names for individual positions, may be repeated
    parser.add_argument(
        '--position', '-k',
        metavar = 'k=name',
        action = 'append',
        type = spectype,
        default = [],
        help = 'name for position k, 1-based')

    args = parser.parse_args()
    args.prog = parser.prog
    return args
@enum.unique
class Kind(enum.Enum):
    """Kind of an input line, or of a group of consecutive such lines."""

    # markup line that is neither a comment nor a sentence start tag
    meta = 1
    # line that does not start with '<'
    data = 2
    # sentence start tag
    begin = 3
    # comment line recognized as a positional-attributes comment
    names = 4
    # any other comment line
    comment = 5
def identify(line):
    """Return the Kind of a line (bytes); used as the key for groupby.

    Lines that do not start with '<' are data. Lines that start with
    '<!--' are names if isbinnames accepts them and otherwise plain
    comments. Of the remaining lines, those that start with
    '<sentence' are begin and the rest are meta.
    """
    if not line.startswith(b'<'):
        return Kind.data
    if line.startswith(b'<!--'):
        return Kind.names if isbinnames(line) else Kind.comment
    return Kind.begin if line.startswith(b'<sentence') else Kind.meta
def makenames(args):
    """Build the new names line and return it with the number of names.

    args.position is a list of b'K=NAME' specifications and
    args.number is the requested number of fields. The number of
    names is the larger of args.number and the largest specified
    position K. Positions without a specification are named v1, v2,
    ... by their 1-based position. The line itself is produced by
    binmakenames.

    Raise BadData if any name occurs more than once.
    """
    # explicitly named positions, 1-based position to name
    named = {}
    for spec in args.position:
        index, name = spec.split(b'=')
        named[int(index)] = name

    width = max(args.number, max(named, default = 1))

    # default names first, then overridden where specified
    names = [ b'v' + str(k).encode('UTF-8')
              for k in range(1, width + 1) ]
    for index, name in named.items():
        names[index - 1] = name

    if len(set(names)) != len(names):
        raise BadData('error: duplicate names')

    return binmakenames(*names), len(names)
def main(args, inf, ouf):
    """Rename the fields of the document in inf, writing to ouf.

    Return 0 on success and 1 on any failure. BadData and BadCode
    are reported to stderr as a one-line message prefixed with
    args.prog, a broken pipe is reported likewise, and any other
    exception is reported as a traceback.
    """
    status = 1
    try:
        # makenames raises BadData on duplicate names, so it has to
        # run inside the try for that to be reported like any other
        # data error instead of escaping from main
        newnames, n = makenames(args)
        implement_main(inf, ouf, newnames, n)
        status = 0
    except (BadData, BadCode) as exn:
        print(args.prog + ': ' + str(exn), file = sys.stderr)
    except BrokenPipeError:
        print(args.prog + ': broken pipe in main', file = sys.stderr)
    except Exception:
        print(traceback.format_exc(), file = sys.stderr)
    return status
def implement_main(inf, ouf, newnames, n):
    """Copy inf to ouf with the new names line in place.

    Write newnames first. Then copy the input, dropping
    whitespace-only lines and writing newnames in place of every
    existing names line. Everything else is copied as is.

    Two checks are made against n, the number of new names. The
    first data line that directly follows a sentence start tag must
    have n tab-separated fields. The first line of every group of
    existing names lines must contain n + 2 runs of the characters
    matched by [\\w.+].

    Raise BadData if either check fails, and BadCode if identify
    returns an unknown kind.
    """
    ouf.write(newnames)

    # kinds whose lines are copied through unchanged
    copied = (Kind.begin, Kind.meta, Kind.data, Kind.comment)

    unchecked = True    # no data line has had its fields counted yet
    opened = False      # the latest non-data group was a sentence start

    nonblank = (line for line in inf if not line.isspace())
    for kind, lines in groupby(nonblank, identify):
        if kind is Kind.begin:
            opened = True
        elif kind is not Kind.data:
            opened = False

        if kind is Kind.names:
            # only the first line of the group is checked, but every
            # line of the group is replaced with the new names
            old = next(lines)
            if len(re.findall(br'[\w.+]+', old)) != n + 2:
                raise BadData('error: unexpected number of names')
            ouf.write(newnames)
            for _ in lines:
                ouf.write(newnames)
            continue

        if kind not in copied:
            raise BadCode('this cannot happen')

        if kind is Kind.data and opened and unchecked:
            # the field count is checked only once, on the first
            # token line found directly inside a sentence
            unchecked = False
            record = next(lines)
            if len(record.split(b'\t')) != n:
                raise BadData('error: unexpected number of fields')
            ouf.write(record)

        for line in lines:
            ouf.write(line)
if __name__ == '__main__':
    # parse the command line first, then hand over to trans_main,
    # asking for both input and output as bytes rather than text
    arguments = parsearguments()
    trans_main(arguments, main, in_as_text = False, out_as_text = False)