-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimplify_slow.py
152 lines (123 loc) · 6.32 KB
/
simplify_slow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
'''
An algorithm for merging word alignments in a way that produces the most compact
representation possible, such that for any selected span of the "from" text, the
resulting aligned span(s) in the "to" text remains the same, and vice versa.
Two alignments are eligible to be merged if:
- One is entirely contained within the other on both sides; or
- Both point to the same span on one side and are overlapping, adjacent, or
separated by only whitespace on the other.
WARNING: The time complexity of this algorithm is O(scary). Even on a fairly
powerful machine, a non-trivial aligned sentence pair took 4.5 min to simplify.
I attempted to reduce the number of iterations necessary by breaking the problem
into smaller sub-graphs of intersecting alignments but couldn't get acceptable
results. There may be a way to optimize this, but I've spent too much time on it
already, so I've instead opted for a simpler, repeated-forward-pass approach
which, while it doesn't produce the optimal result in all cases, is good enough
for real-world sentence pairs -- and more importantly is 8000x faster.
Examples:
a-ab b-ab c-ab ab-c c-abc a-ab a-bc b-bc c-bc d-bc b-abc
ab-ab c-ab ab-c c-abc a-abc b-abc b-bc c-bc d-bc
a-ab bc-ab ab-c c-abc a-ab ab-bc b-abc c-bc d-bc
a-ab b-ab ab-c c-abc a-ab a-bc b-abc bc-bc d-bc
a-ab a-bc b-abc c-bc d-bc
abc-ab ab-c c-abc a-ab a-bc b-abc b-bc cd-bc
ab-abc c-ab c-abc
ab-ab c-abc ab-c ab-abc b-bc c-bc d-bc
a-abc b-abc bc-bc d-bc
abc-abc c-ab a-abc b-abc c-bc d-bc
ab-abc c-abc a-abc b-abc b-bc cd-bc
a-ab abc-bc b-abc d-bc
abc-abc a-ab a-bc b-abc bcd-bc
a-ab a-bc b-abc cd-bc
a-ab ab-bc b-abc cd-bc
ab-abc bc-bc d-bc
a-abc b-abc bcd-bc
ab-abc c-bc d-bc
a-abc b-abc cd-bc
ab-abc b-bc cd-bc
a-ab abcd-bc b-abc
ab-abc bcd-bc
ab-abc cd-bc
Running those examples from the REPL (uncomment the print statements below):
simplify([0,1,0,2,1,2,0,2,2,3,0,2,0,2,2,3,2,3,0,3], 'abcd', 'abcd')
simplify([0,1,0,2,0,1,1,3,1,2,1,3,2,3,1,3,3,4,1,3,1,2,0,3], 'abcd', 'abcd')
'''
from collections import deque
import re
def simplify(alignments: list[int], from_text: str, to_text: str) -> list[int]:
best = group_alignments(alignments)
queue = deque([best])
# Perform a breadth-first search to look for the most compact representation
# “I'm sure this is fine, it's only, like, O(n!ⁿꜝ).” --Famous last words
while len(queue) > 0:
current = queue.popleft()
# print(_debug(current, from_text, to_text))
# Iterate over every combination of two alignment groups
for i, left in enumerate(current[:len(current) - 1]):
for right in current[i + 1:]:
# See if these groups can be merged
merged = merge_alignments(left, right, from_text, to_text)
if merged is None:
continue
# Replace the two groups with the merged one
simplified = [*(x for x in current if x != left and x != right), merged]
simplified.sort()
# Skip if already in the queue
if simplified in queue:
continue
# Check if the new alignments is better
if compare(simplified, best) < 0:
best = simplified
# Even if not, we may still get a better result continuing down this
# path than with the current-best
queue.append(simplified)
# print(' ' + _debug(simplified, from_text, to_text))
return ungroup_alignments(best)
def merge_alignments(
left: tuple[int, int, int, int],
right: tuple[int, int, int, int],
from_text: str,
to_text: str) -> tuple[int, int, int, int] | None:
# Check if left is entirely contained within right
if left[0] >= right[0] and left[1] <= right[1] and \
left[2] >= right[2] and left[3] <= right[3]:
return right
# Check if right is entirely contained within left
if right[0] >= left[0] and right[1] <= left[1] and \
right[2] >= left[2] and right[3] <= left[3]:
return left
# Check if 'from' is the same and 'to' is overlapping/adjacent
if left[0] == right[0] and left[1] == right[1] and \
is_overlapping_or_adjacent(left[2], left[3], right[2], right[3], to_text):
return (left[0], left[1], min(left[2], right[2]), max(left[3], right[3]))
# Check if 'to' is the same and 'from' is overlapping/adjacent
if left[2] == right[2] and left[3] == right[3] and \
is_overlapping_or_adjacent(left[0], left[1], right[0], right[1], from_text):
return (min(left[0], right[0]), max(left[1], right[1]), left[2], left[3])
return None
def is_overlapping_or_adjacent(start1: int, end1: int, start2: int, end2: int, text: str) -> bool:
'''
Returns true if [start1,end1) overlaps with, touches, or is separated only by
whitespace from [start2,end2).
'''
return (end1 < start2 and not re.search(r'\S', text[end1:start2])) or \
(start1 <= end2 and end1 >= start2) or \
(start1 > end2 and not re.search(r'\S', text[end2:start1]))
def compare(a: list[tuple[int, int, int, int]], b: list[tuple[int, int, int, int]]) -> int:
if len(a) < len(b):
return -1
if len(a) > len(b):
return 1
# Prefer shorter spans, just for cleaner visualizations
aSpanLen = sum((x[1] - x[0]) + (x[3] - x[2]) for x in a)
bSpanLen = sum((x[1] - x[0]) + (x[3] - x[2]) for x in b)
return aSpanLen - bSpanLen
def group_alignments(alignments: list[int]) -> list[tuple[int, int, int, int]]:
return [tuple(alignments[i*4:(i+1)*4]) for i in range(len(alignments)//4)]
def ungroup_alignments(alignments: list[tuple[int, int, int, int]]) -> list[int]:
return [x for group in alignments for x in group]
def _debug(alignments: list[tuple[int, int, int, int]], from_text: str, to_text: str) -> str:
return ' '.join([
f'{from_text[from_start:from_end]}-{to_text[to_start:to_end]}'
for [from_start, from_end, to_start, to_end] in alignments
])