forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
vrt-scramble.py
executable file
·102 lines (88 loc) · 3.65 KB
/
vrt-scramble.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#! /usr/bin/env python2
# -*- coding: utf-8 -*-
"""Scramble (randomly shuffle) structures in a VRT input."""
import sys
import re
import random
import korpimport.util
class VrtScrambler(korpimport.util.InputProcessor):

    """Shuffle structures within containing structures in VRT input.

    The lines of each containing structure (option ``scramble_within``,
    default ``text``) are split into units beginning at the start tag
    of the structure to be shuffled (option ``scramble_unit``, default
    ``sentence``). The units of each containing structure are written
    to standard output in random order; all other lines are passed
    through unchanged.
    """

    def __init__(self):
        """Initialize the base class and the random number generator.

        NOTE(review): this reads ``self._opts`` right after the base
        class initializer, so the base class presumably parses the
        options (via ``getopts``) in its ``__init__`` -- confirm
        against ``korpimport.util.InputProcessor``.
        """
        super(VrtScrambler, self).__init__()
        # Not referenced elsewhere in this class; kept in case code
        # outside this class reads it.
        self._scramble_units = []
        # A seed of None (see getopts) makes random.Random seed itself
        # from a system source, giving non-reproducible output.
        self._rnd = random.Random(self._opts.random_seed)

    def process_input_stream(self, stream, filename=None):
        """Copy `stream` to stdout, shuffling units within containers.

        Arguments:
            stream: an iterable of input lines (with line terminators)
            filename: not used by this method

        Lines outside the containing structures, as well as the start
        and end tags of the containing structures, are written as they
        are. Within a containing structure, lines are collected to
        units, each beginning at a start tag of the structure to be
        shuffled; at the end tag of the containing structure, the
        units are shuffled and written out.

        A line beginning with "<" between the start of the containing
        structure and the first unit is reported with `self.error`.

        If the input ends while a containing structure is still open,
        the units collected so far are shuffled and written out
        instead of being silently dropped.
        """
        within_name = self._opts.scramble_within
        unit_name = self._opts.scramble_unit
        # Escape the structure names so that any regular expression
        # metacharacters in them are matched literally. The u'' prefix
        # keeps the patterns Unicode in Python 2 while also being
        # valid syntax in Python 3 (unlike the prefix ur'').
        within_begin_re = re.compile(
            u'<' + re.escape(within_name) + r'[>\s]')
        scramble_begin_re = re.compile(
            u'<' + re.escape(unit_name) + r'[>\s]')
        scramble_end = '</' + within_name + '>'
        collecting = False
        units = []
        current_unit = []
        for line in stream:
            self._linenr += 1
            if not collecting:
                # Outside a containing structure: pass through
                sys.stdout.write(line)
                if within_begin_re.match(line):
                    units = []
                    current_unit = []
                    collecting = True
            elif line.startswith(scramble_end):
                # End of the containing structure: output its units
                if current_unit:
                    units.append(current_unit)
                collecting = False
                self._write_scrambled(units)
                sys.stdout.write(line)
            elif scramble_begin_re.match(line):
                # A new unit begins: close the previous one, if any
                if current_unit:
                    units.append(current_unit)
                current_unit = [line]
            elif line.startswith('<') and not current_unit:
                # A tag before the first unit of the containing
                # structure; the name is empty if the tag does not
                # begin with a structure name (such as an end tag)
                mo = re.match(r'<([a-z_0-9]+)', line)
                struct = mo.group(1) if mo else ''
                self.error('Structure \'%s\' between \'%s\' and \'%s\''
                           % (struct, within_name, unit_name))
            else:
                current_unit.append(line)
        if collecting:
            # The input ended within a containing structure: do not
            # lose the lines collected so far
            if current_unit:
                units.append(current_unit)
            self._write_scrambled(units)

    def _write_scrambled(self, units):
        """Write the lines of `units` to stdout in shuffled unit order."""
        for line in self._scramble(units):
            sys.stdout.write(line)

    def _scramble(self, units):
        """Shuffle `units` and generate their lines in the new order.

        `units` is a list of lists of lines; it is shuffled in place
        with the random number generator of the instance. The order of
        the lines within each unit is preserved.
        """
        self._rnd.shuffle(units)
        for unit in units:
            for line in unit:
                yield line

    def getopts(self, args=None):
        """Define and process the command-line options in `args`.

        The options are passed to `self.getopts_basic`. After that, a
        random seed value of "" or "0" in `self._opts.random_seed` is
        replaced with None, requesting a non-reproducible seed.
        """
        self.getopts_basic(
            dict(usage="%prog [options] [input] > output",
                 description=(
"""Scramble (randomly shuffle) given structures (elements), such as sentences,
within larger structures, such as texts, in the VRT input and output the
scrambled VRT.
Note that the input may not have intermediate structures between the
containing structures and the structures to be scrambled; for example, if
sentences are scrambled within texts, the input may not have paragraphs.
""")
             ),
            args,
            ['scramble-unit|unit =STRUCT', dict(
                default='sentence',
                help=('shuffle STRUCT structures (elements)'
                      ' (default: %default)'))],
            ['scramble-within|within =STRUCT', dict(
                default='text',
                help=('shuffle structures within STRUCT structures (elements):'
                      ' structures are not moved across STRUCT boundaries'
                      ' (default: %default)'))],
            ['random-seed|seed =SEED', dict(
                default='2017',
                help=('set random number generator seed to SEED (any string);'
                      ' use 0 or "" for a random seed (non-reproducible'
                      ' output) (default: %default)'))],
        )
        if self._opts.random_seed in ['', '0']:
            self._opts.random_seed = None
if __name__ == "__main__":
    # Script entry point: construct the scrambler and hand control to
    # its run() method.
    scrambler = VrtScrambler()
    scrambler.run()