# Create LibriSpeech Corpus
import argparse
import math
import re
import sys
from os import makedirs, walk
from os.path import exists, splitext, join, basename, pardir, abspath
import pandas as pd
from tabulate import tabulate
from tqdm import tqdm
from unidecode import unidecode
from constants import LS_RAW, LS_ROOT
from util.audio_util import seconds_to_frame, resample, crop_segments
from util.corpus_util import find_file_by_suffix
from util.log_util import create_args_str
from util.string_util import normalize, contains_numeric

parser = argparse.ArgumentParser(description="""Create LibriSpeech corpus from raw files""")
parser.add_argument('-f', '--file', help='Dummy argument for Jupyter Notebook compatibility')
parser.add_argument('-s', '--source', default=LS_RAW,
                    help=f'(optional) source root directory (default: {LS_RAW})')
parser.add_argument('-t', '--target', default=LS_ROOT,
                    help=f'(optional) target root directory (default: {LS_ROOT})')
parser.add_argument('-l', '--limit', type=int, default=None,
                    help='(optional) maximum number of corpus entries to process. Default=None=\'all\'')
parser.add_argument('-o', '--overwrite', default=False, action='store_true',
                    help='(optional) overwrite existing audio data if already present. If set, the audio '
                         'data will be converted, resampled and cropped to a 16kHz mono WAV file, which '
                         'prolongs the corpus creation process considerably. If not set, the conversion '
                         'is skipped for audio files already present in the target directory and the '
                         'corpus is only updated with the most recent corpus entries. Default=False')
args = parser.parse_args()
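# A typical invocation might look like this (illustrative only; LS_RAW and LS_ROOT are
# project-specific constants, so the actual defaults depend on your constants.py):
#
#   python create_ls_corpus.py --source /path/to/librispeech/raw \
#                              --target /path/to/librispeech/corpus --limit 10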

def main():
    print(create_args_str(args))
    print(f'Processing files from {args.source} and saving them in {args.target}')
    corpus, corpus_file = create_corpus(args.source, args.target, args.limit)
    print(f'Done! Corpus with {len(corpus)} entries saved to {corpus_file}')

def create_corpus(source_dir, target_dir, limit=None):
    if not exists(source_dir):
        print(f'ERROR: source directory {source_dir} does not exist!')
        exit(1)
    if not exists(target_dir):
        print(f'creating target directory {target_dir} as it does not exist yet')
        makedirs(target_dir)
    df = create_segments(source_dir=source_dir, target_dir=target_dir, limit=limit)
    index_file = join(target_dir, 'index.csv')
    df.to_csv(index_file)
    return df, index_file

def create_segments(source_dir, target_dir, limit):
    audio_root = join(source_dir, 'audio')
    books_root = join(source_dir, 'books')
    chapters_file = find_file_by_suffix(audio_root, 'CHAPTERS.TXT')
    chapters = collect_chapter_meta(chapters_file)
    books = collect_book_texts(books_root)

    # leaf directories of the audio tree whose name is a known chapter ID
    directories = [root for root, subdirs, files in walk(audio_root)
                   if not subdirs and basename(root) in chapters.keys()][:limit]
    progress = tqdm(directories, total=min(len(directories), limit or math.inf), file=sys.stderr, unit='entries')

    segments = []
    for source_dir in progress:
        progress.set_description(f'{source_dir:{100}}')
        chapter_id = basename(source_dir)
        speaker_id = basename(abspath(join(source_dir, pardir)))
        if chapter_id not in chapters:
            print(f'WARNING: chapter {chapter_id} unknown or not in train-clean-xxx. Skipping corpus entry...')
            continue
        book_id = chapters[chapter_id]['book_id']
        if not book_id:
            print(f'WARNING: no book information available for chapter {chapter_id}. Skipping corpus entry...')
            continue
        if book_id not in books:
            print(f'WARNING: no book text available for chapter {chapter_id}. Skipping corpus entry...')
            continue

        segments_file = find_file_by_suffix(source_dir, f'{speaker_id}-{chapter_id}.seg.txt')
        if not segments_file:
            print(f'no segmentation {speaker_id}-{chapter_id}.seg.txt found in {source_dir}. Skipping corpus entry...')
            continue
        transcript_file = find_file_by_suffix(source_dir, f'{speaker_id}-{chapter_id}.trans.txt')
        if not transcript_file:
            print(f'no transcript {speaker_id}-{chapter_id}.trans.txt found in {source_dir}. Skipping corpus entry...')
            continue
        mp3_file = find_file_by_suffix(source_dir, f'{chapter_id}.mp3')
        if not mp3_file:
            print(f'no MP3 file {chapter_id}.mp3 found in {source_dir}. Skipping corpus entry...')
            continue

        segment_infos = extract_segment_infos(segments_file, transcript_file)
        crop_start, crop_end = crop_segments(segment_infos)

        # resample audio if necessary
        wav_file = join(target_dir, basename(splitext(mp3_file)[0] + '.wav'))
        if not exists(wav_file) or args.overwrite:
            resample(mp3_file, wav_file, crop_start, crop_end)

        # write the portion of the book text covered by the segments as full transcript
        with open(join(target_dir, f'{chapter_id}.txt'), 'w') as f:
            book_text = normalize(books[book_id], 'en')
            first_transcript = segment_infos[0]['transcript']
            if first_transcript in book_text:
                text_start = book_text.index(first_transcript)
            else:
                # find the start by searching for the longest prefix of the first
                # transcript that still occurs in the book text
                text_start = 0
                for i in range(1, len(first_transcript) - 1):
                    if first_transcript[:i] not in book_text:
                        text_start = book_text.index(first_transcript[:i - 1])
                        break
            last_transcript = segment_infos[-1]['transcript']
            if last_transcript in book_text:
                text_end = book_text.index(last_transcript) + len(last_transcript)
            else:
                # find the end by searching for the longest suffix of the last
                # transcript that still occurs in the book text
                text_end = len(book_text)
                for i in range(1, len(last_transcript) - 1):
                    if last_transcript[-i:] not in book_text:
                        text_end = book_text.index(last_transcript[-i + 1:]) + i - 1
                        break
            f.write(book_text[text_start:text_end])

        # create segments
        for segment_info in segment_infos:
            entry_id = chapter_id
            subset = chapters[chapter_id]['subset']
            audio_file = basename(wav_file)
            start_frame = segment_info['start_frame']
            end_frame = segment_info['end_frame']
            transcript = segment_info['transcript']
            duration = (end_frame - start_frame) / 16000  # frames are sample indices at 16kHz
            numeric = contains_numeric(transcript)
            segments.append([entry_id, subset, 'en', audio_file, start_frame, end_frame, duration,
                             transcript, numeric])

    columns = ['entry_id', 'subset', 'language', 'audio_file', 'start_frame', 'end_frame', 'duration',
               'transcript', 'numeric']
    return pd.DataFrame(segments, columns=columns)
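# For illustration, a row of the resulting DataFrame (and of index.csv) might look like
# this (values are made up; frames are sample indices into the 16kHz WAV file):
#
#   entry_id  subset  language  audio_file  start_frame  end_frame  duration  transcript          numeric
#   198       train   en        198.wav     1600         227520     14.12     northanger abbey... False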

def collect_chapter_meta(chapters_file):
    chapter_meta = {'unknown': {'subset': 'unknown', 'book_id': None}}
    line_pattern = re.compile(r'(?P<chapter>\d+)\s*\|.*\|.*\|\s*(?P<subset>.*?)\s*\|.*\|\s*(?P<book>\d+)\s*\|.*\|.*')
    with open(chapters_file) as f:
        for line in (line for line in f if not line.startswith(';')):
            result = line_pattern.search(line)
            if not result:
                continue
            subset = result.group('subset')
            if '-clean' in subset:
                chapter_id = result.group('chapter')
                subset = subset.split('-clean')[0]  # only keep train/dev/test
                book_id = result.group('book')
                chapter_meta[chapter_id] = {'subset': subset, 'book_id': book_id}
    return chapter_meta
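# For reference, a CHAPTERS.TXT line as expected by the pattern above (pipe-separated;
# only the chapter ID, the subset and the book ID are captured, the remaining columns
# are ignored). Illustrative values:
#
#   24 | 8 | 7.18 | train-clean-100 | 68 | 211 | Northanger Abbey | Northanger Abbey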

def collect_book_texts(books_root):
    book_texts = {}
    invalid_encodings = []
    for root, files in tqdm([(root, files) for root, subdirs, files in walk(books_root)
                             if not subdirs and len(files) == 1], unit='books'):
        book_file = join(root, files[0])
        book_id = basename(root)
        encoding = 'ascii' if 'ascii' in book_file else 'utf-8'
        with open(book_file, 'r', encoding=encoding) as f:
            try:
                book_text = f.read().strip()
                if len(book_text) > 0:
                    book_texts[book_id] = book_text
            except UnicodeDecodeError as e:
                invalid_encodings.append((book_id, book_file, encoding, e.start, e.end))
    if invalid_encodings:
        print(f'could not decode the following {len(invalid_encodings)} books because of decoding errors:')
        print(tabulate(invalid_encodings, headers=['id', 'file', 'encoding', 'start', 'end']))
        print('trying to fix those files by using Latin-1 encoding and removing invalid HTML markup')
        for book_id, book_file, encoding, start, end in invalid_encodings:
            with open(book_file, 'r', encoding='latin-1') as f:
                book_text = f.read()
            # keep only the text between <pre> and </pre> (everything outside is HTML markup)
            if '<pre>' in book_text:
                book_text = book_text[book_text.index('<pre>') + len('<pre>'):]
            if '</pre>' in book_text:
                book_text = book_text[:book_text.index('</pre>')]
            with open(book_file, 'w', encoding='ascii') as f:
                f.write(unidecode(book_text))
            # the rewritten files are only picked up on a subsequent run
    return book_texts

def extract_segment_infos(segments_file, transcript_file):
    transcripts = {}
    with open(transcript_file, 'r') as f_transcript:
        for line in f_transcript:
            segment_id, transcript = line.split(' ', 1)
            transcripts[segment_id] = transcript.replace('\n', '')
    line_pattern = re.compile(r'(?P<segment_id>.*)\s(?P<segment_start>.*)\s(?P<segment_end>.*)\n')
    segment_infos = []
    with open(segments_file, 'r') as f_segments:
        for line in f_segments:
            result = line_pattern.search(line)
            if result:
                segment_id = result.group('segment_id')
                segment_infos.append({
                    'start_frame': seconds_to_frame(result.group('segment_start')),
                    'end_frame': seconds_to_frame(result.group('segment_end')),
                    'transcript': normalize(transcripts[segment_id], 'en') if segment_id in transcripts else ''
                })
    return segment_infos
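# For reference, the two input files are expected to look roughly like this (illustrative;
# IDs follow the <speaker>-<chapter>-<utterance> scheme, times are in seconds):
#
#   19-198.seg.txt:     19-198-0000 0.0 14.12
#   19-198.trans.txt:   19-198-0000 NORTHANGER ABBEY THIS LITTLE WORK ...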

if __name__ == '__main__':
    main()