# text.py
import codecs
import re
import unicodedata
from functools import reduce  # needed by gather_nd below (reduce is not a builtin in Python 3)

import numpy as np
import tensorflow as tf

# Constants
SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1  # 0 is reserved for the space token


def normalize_txt_file(txt_file, remove_apostrophe=True):
    """
    Given a path to a text file, return contents with unsupported characters removed.
    """
    with codecs.open(txt_file, encoding="utf-8") as open_txt_file:
        return normalize_text(open_txt_file.read(), remove_apostrophe=remove_apostrophe)


def normalize_text(original, remove_apostrophe=True):
    """
    Given a Python string ``original``, remove unsupported characters.
    The only supported characters are letters and apostrophes.
    """
    # convert any unicode characters to ASCII equivalent
    # then ignore anything else and decode to a string
    result = unicodedata.normalize("NFKD", original).encode("ascii", "ignore").decode()
    if remove_apostrophe:
        # remove apostrophes to keep contractions together
        result = result.replace("'", "")
    # return lowercase alphabetic characters and apostrophes (if still present)
    return re.sub("[^a-zA-Z']+", ' ', result).strip().lower()
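
# A quick illustration of normalize_text (the strings below are illustrative, not from the
# original project): accents are folded to ASCII, digits and punctuation collapse to single
# spaces, and the result is lowercased.
#   normalize_text("Voilà, it's 2 PM!")                          -> 'voila its pm'
#   normalize_text("Voilà, it's 2 PM!", remove_apostrophe=False) -> "voila it's pm"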


def text_to_char_array(original):
    """
    Given a Python string ``original``, map characters
    to integers and return a numpy array representing the processed string.

    This function has been modified from Mozilla DeepSpeech:
    https://github.com/mozilla/DeepSpeech/blob/master/util/text.py

    # This Source Code Form is subject to the terms of the Mozilla Public
    # License, v. 2.0. If a copy of the MPL was not distributed with this
    # file, You can obtain one at http://mozilla.org/MPL/2.0/.
    """
    # Create list of sentence's words with spaces replaced by ''
    # (doubling each space first so that str.split leaves an '' marker between words)
    result = original.replace(' ', '  ')
    result = result.split(' ')
    # Tokenize words into letters, adding in SPACE_TOKEN where required
    result = np.hstack([SPACE_TOKEN if xt == '' else list(xt) for xt in result])
    # Return characters mapped into indices
    return np.asarray([SPACE_INDEX if xt == SPACE_TOKEN else ord(xt) - FIRST_INDEX for xt in result])
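
# Example (illustrative): with FIRST_INDEX = ord('a') - 1, letters map to 1..26 and the
# space between words becomes SPACE_INDEX:
#   text_to_char_array('ab c') -> array([1, 2, 0, 3])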


def sparse_tuple_from(sequences, dtype=np.int32):
    """
    Create a sparse representation of ``sequences``.

    Args:
        sequences: a list of lists of type dtype where each element is a sequence

    Returns:
        A tuple with (indices, values, shape)

    This function has been modified from Mozilla DeepSpeech:
    https://github.com/mozilla/DeepSpeech/blob/master/util/text.py

    # This Source Code Form is subject to the terms of the Mozilla Public
    # License, v. 2.0. If a copy of the MPL was not distributed with this
    # file, You can obtain one at http://mozilla.org/MPL/2.0/.
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=dtype)
    shape = np.asarray([len(sequences), indices.max(0)[1] + 1], dtype=np.int64)

    # return tf.SparseTensor(indices=indices, values=values, shape=shape)
    return indices, values, shape
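
# Example (illustrative):
#   sparse_tuple_from([[1, 2, 3], [4, 5]])
#   -> indices [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]], values [1, 2, 3, 4, 5], shape [2, 3]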


def sparse_tensor_value_to_texts(value):
    """
    Given an evaluated :class:`tf.SparseTensorValue` ``value``, return an array of
    Python strings representing its values.

    This function has been modified from Mozilla DeepSpeech:
    https://github.com/mozilla/DeepSpeech/blob/master/util/text.py

    # This Source Code Form is subject to the terms of the Mozilla Public
    # License, v. 2.0. If a copy of the MPL was not distributed with this
    # file, You can obtain one at http://mozilla.org/MPL/2.0/.
    """
    return sparse_tuple_to_texts((value.indices, value.values, value.dense_shape))


def sparse_tuple_to_texts(tup):
    '''
    Given an (indices, values, shape) tuple such as the one returned by
    ``sparse_tuple_from``, decode it back into one Python string per batch entry.

    This function has been modified from Mozilla DeepSpeech:
    https://github.com/mozilla/DeepSpeech/blob/master/util/text.py

    # This Source Code Form is subject to the terms of the Mozilla Public
    # License, v. 2.0. If a copy of the MPL was not distributed with this
    # file, You can obtain one at http://mozilla.org/MPL/2.0/.
    '''
    indices = tup[0]
    values = tup[1]
    results = [''] * tup[2][0]
    for i in range(len(indices)):
        index = indices[i][0]
        c = values[i]
        c = ' ' if c == SPACE_INDEX else chr(c + FIRST_INDEX)
        results[index] = results[index] + c
    # List of strings
    return results
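
# Example (illustrative) round trip through the helpers above:
#   sparse_tuple_to_texts(sparse_tuple_from([text_to_char_array('ab c'),
#                                            text_to_char_array('de')]))
#   -> ['ab c', 'de']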


def ndarray_to_text(value):
    '''
    Map a 1-D array of character indices back into a Python string
    (the inverse of ``text_to_char_array``).

    This function has been modified from Mozilla DeepSpeech:
    https://github.com/mozilla/DeepSpeech/blob/master/util/text.py

    # This Source Code Form is subject to the terms of the Mozilla Public
    # License, v. 2.0. If a copy of the MPL was not distributed with this
    # file, You can obtain one at http://mozilla.org/MPL/2.0/.
    '''
    results = ''
    for i in range(len(value)):
        results += chr(value[i] + FIRST_INDEX)
    # index 0 maps to '`' (one below 'a'), which stands in for the space character
    return results.replace('`', ' ')
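
# Example (illustrative): ndarray_to_text(np.array([1, 2, 0, 3])) -> 'ab c'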


def gather_nd(params, indices, shape):
    '''
    Gather elements of ``params`` (with shape ``shape``) at the positions given by
    full-rank ``indices``, by flattening ``params`` and computing linear offsets.

    # Function taken from https://github.com/tensorflow/tensorflow/issues/206#issuecomment-229678962
    '''
    rank = len(shape)
    flat_params = tf.reshape(params, [-1])
    multipliers = [reduce(lambda x, y: x * y, shape[i + 1:], 1) for i in range(0, rank)]
    indices_unpacked = tf.unstack(tf.transpose(indices, [rank - 1] + list(range(0, rank - 1))))
    flat_indices = sum([a * b for a, b in zip(multipliers, indices_unpacked)])
    return tf.gather(flat_params, flat_indices)
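
# Example (illustrative): for params [[10, 11, 12], [20, 21, 22]] with shape [2, 3] and
# indices [[0, 2], [1, 0]], gather_nd returns [12, 20], matching tf.gather_nd for
# full-rank indices.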


def ctc_label_dense_to_sparse(labels, label_lengths, batch_size):
    '''
    The CTC implementation in TensorFlow needs labels in a sparse representation,
    but sparse data and queues don't mix well, so we store padded tensors in the
    queue and convert to a sparse representation after dequeuing a batch.

    Taken from https://github.com/tensorflow/tensorflow/issues/1742#issuecomment-205291527
    '''
    # The second dimension of labels must be equal to the longest label length in the batch
    correct_shape_assert = tf.assert_equal(tf.shape(labels)[1], tf.reduce_max(label_lengths))
    with tf.control_dependencies([correct_shape_assert]):
        labels = tf.identity(labels)

    label_shape = tf.shape(labels)
    num_batches_tns = tf.stack([label_shape[0]])
    max_num_labels_tns = tf.stack([label_shape[1]])

    def range_less_than(previous_state, current_input):
        return tf.expand_dims(tf.range(label_shape[1]), 0) < current_input

    init = tf.cast(tf.fill(max_num_labels_tns, 0), tf.bool)
    init = tf.expand_dims(init, 0)
    dense_mask = tf.scan(range_less_than, label_lengths, initializer=init, parallel_iterations=1)
    dense_mask = dense_mask[:, 0, :]

    label_array = tf.reshape(tf.tile(tf.range(0, label_shape[1]), num_batches_tns), label_shape)
    label_ind = tf.boolean_mask(label_array, dense_mask)

    batch_array = tf.transpose(tf.reshape(tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns),
                                          tf.reverse(label_shape, [0])))
    batch_ind = tf.boolean_mask(batch_array, dense_mask)

    batch_label = tf.concat([batch_ind, label_ind], 0)
    indices = tf.transpose(tf.reshape(batch_label, [2, -1]))
    shape = [batch_size, tf.reduce_max(label_lengths)]
    vals_sparse = gather_nd(labels, indices, shape)

    return tf.SparseTensor(tf.to_int64(indices), vals_sparse, tf.to_int64(label_shape))
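
# Rough usage sketch (TF1-style graph execution; the tensors below are illustrative):
#   labels = tf.constant([[1, 2, 3], [4, 5, 0]])   # padded label batch, trailing 0 is padding
#   label_lengths = tf.constant([3, 2])
#   sparse_labels = ctc_label_dense_to_sparse(labels, label_lengths, batch_size=2)
#   # Evaluating sparse_labels yields values [1, 2, 3, 4, 5], indices
#   # [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]] and dense_shape [2, 3], the sparse form
#   # expected by tf.nn.ctc_loss.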